From f6a5cb162b41c9ecba246d38842e662c6d31a548 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Jul 2021 16:40:43 +0200
Subject: [PATCH 01/52] Renamed Reduction.h to reduce.h

The file should be named after the main function which is implemented in
it. Also changed the parameter name from "reduce" to "reduction" to
differentiate it from the main "reduce" function.
---
 .../ReductionAndScan/ComparisonExample.cpp    |  2 +-
 .../ReductionAndScan/MapReduceExample-1.cpp   |  2 +-
 .../ReductionAndScan/MapReduceExample-2.cpp   |  2 +-
 .../ReductionAndScan/MapReduceExample-3.cpp   |  2 +-
 .../ReductionAndScan/MaximumNormExample.cpp   |  2 +-
 .../ReductionAndScan/ProductExample.cpp       |  2 +-
 .../ReductionWithArgument.cpp                 |  2 +-
 .../ReductionWithArgumentWithFunctional.cpp   |  2 +-
 .../ReductionAndScan/ScalarProductExample.cpp |  2 +-
 .../ScalarProductWithFunctionalExample.cpp    |  2 +-
 .../Tutorials/ReductionAndScan/SumExample.cpp |  2 +-
 .../SumExampleWithFunctional.cpp              |  2 +-
 .../UpdateAndResidueExample.cpp               |  2 +-
 .../BLAS/CommonVectorOperations.hpp           |  2 +-
 src/TNL/Algorithms/MemoryOperationsCuda.hpp   |  2 +-
 src/TNL/Algorithms/MemoryOperationsHost.hpp   |  2 +-
 src/TNL/Algorithms/detail/Reduction.hpp       |  2 +-
 src/TNL/Algorithms/{Reduction.h => reduce.h}  | 69 ++++++++++---------
 src/TNL/Containers/Expressions/Comparison.h   |  2 +-
 .../Expressions/VerticalOperations.h          |  2 +-
 src/TNL/Matrices/SparseMatrix.hpp             |  2 -
 src/TNL/Matrices/SparseMatrixView.hpp         |  2 +-
 src/UnitTests/Algorithms/CMakeLists.txt       |  2 +-
 src/UnitTests/Algorithms/ReductionTest.cpp    |  1 -
 src/UnitTests/Algorithms/ReductionTest.cu     |  1 -
 src/UnitTests/Algorithms/reduceTest.cpp       |  1 +
 src/UnitTests/Algorithms/reduceTest.cu        |  1 +
 .../{ReductionTest.h => reduceTest.h}         |  4 +-
 src/UnitTests/Matrices/DenseMatrixTest.h      |  2 +-
 29 files changed, 61 insertions(+), 62 deletions(-)
 rename src/TNL/Algorithms/{Reduction.h => reduce.h} (79%)
 delete mode 100644 src/UnitTests/Algorithms/ReductionTest.cpp
 delete mode 100644 src/UnitTests/Algorithms/ReductionTest.cu
 create mode 100644 src/UnitTests/Algorithms/reduceTest.cpp
 create mode 100644 src/UnitTests/Algorithms/reduceTest.cu
 rename src/UnitTests/Algorithms/{ReductionTest.h => reduceTest.h} (98%)
diff --git a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
index 8972af7f4..3279fa377 100644
--- a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
index ff02f9c86..8d1527aaa 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
index 065f4608a..c0cdb7e21 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Timer.h>
 
 using namespace TNL;
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
index f3c54f6b0..0b93682c1 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Timer.h>
 
 using namespace TNL;
diff --git a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
index c9a5926ad..b79042db6 100644
--- a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
index 389ecd497..ace350b39 100644
--- a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
index 79a82c733..d7dba9594 100644
--- a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
index 7b084db0e..e5d24ab43 100644
--- a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
index 2dd84aa03..a44410185 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
index 4838f5f77..df9b30206 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
index cfa6e1bef..278ade2e5 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
index 9ef7795cd..4197436e1 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
index a2ccb8189..bb8f20d2c 100644
--- a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
index 72c1f344d..a8d0457fc 100644
--- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp
+++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include "CommonVectorOperations.h"
 
 namespace TNL {
diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index 626847eba..b5db72b2a 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -17,7 +17,7 @@
 #include <TNL/Algorithms/MemoryOperations.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 
 namespace TNL {
diff --git a/src/TNL/Algorithms/MemoryOperationsHost.hpp b/src/TNL/Algorithms/MemoryOperationsHost.hpp
index abebd9d15..dc5aa9b24 100644
--- a/src/TNL/Algorithms/MemoryOperationsHost.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsHost.hpp
@@ -16,7 +16,7 @@
 
 #include <TNL/Algorithms/MemoryOperations.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 namespace TNL {
 namespace Algorithms {
diff --git a/src/TNL/Algorithms/detail/Reduction.hpp b/src/TNL/Algorithms/detail/Reduction.hpp
index 0d1c8231f..945d6baa1 100644
--- a/src/TNL/Algorithms/detail/Reduction.hpp
+++ b/src/TNL/Algorithms/detail/Reduction.hpp
@@ -16,7 +16,7 @@
 
 //#define CUDA_REDUCTION_PROFILING
 
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/detail/Reduction.h>
 #include <TNL/Algorithms/detail/CudaReductionKernel.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 
diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/reduce.h
similarity index 79%
rename from src/TNL/Algorithms/Reduction.h
rename to src/TNL/Algorithms/reduce.h
index da6bca882..875e84319 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/reduce.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          Reduction.h  -  description
+                          reduce.h  -  description
                              -------------------
     begin                : Oct 28, 2010
     copyright            : (C) 2010 by Tomas Oberhuber et al.
@@ -21,10 +21,10 @@
 #include <TNL/Algorithms/detail/Reduction.h>
 
 namespace TNL {
-   namespace Algorithms {
+namespace Algorithms {
 
 /**
- * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
+ * \brief \e reduce implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
  *
  * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
  * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
@@ -35,14 +35,14 @@ namespace TNL {
  * \tparam Index is a type for indexing.
  * \tparam Result is a type of the reduction result.
  * \tparam Fetch is a lambda function for fetching the input data.
- * \tparam Reduce is a lambda function performing the reduction.
+ * \tparam Reduction is a lambda function performing the reduction.
  *
  * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
  *
  * \param begin defines range [begin, end) of indexes which will be used for the reduction.
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
  * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation.
+ * \param reduction is a lambda function defining the reduction operation.
  * \param zero is the idempotent element for the reduction operation, i.e. element which
  *             does not change the result of the reduction.
  * \return result of the reduction
@@ -53,10 +53,10 @@ namespace TNL {
  * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
  * ```
  *
- * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ * The `reduction` lambda function takes two variables which are supposed to be reduced:
  *
  * ```
- * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
  * ```
  *
  * \par Example
@@ -71,14 +71,14 @@ template< typename Device,
           typename Index,
           typename Result,
           typename Fetch,
-          typename Reduce >
+          typename Reduction >
 Result reduce( const Index begin,
                const Index end,
                Fetch&& fetch,
-               Reduce&& reduce,
+               Reduction&& reduction,
                const Result& zero )
 {
-    return detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ), zero );
+    return detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduction >( reduction ), zero );
 }
 
 /**
@@ -87,17 +87,17 @@ Result reduce( const Index begin,
  * \tparam Device parameter says on what device the reduction is gonna be performed.
  * \tparam Index is a type for indexing.
  * \tparam Fetch is a lambda function for fetching the input data.
- * \tparam Reduce is a functional performing the reduction.
+ * \tparam Reduction is a functional performing the reduction.
  *
  * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
  *
- * \e Reduce can be one of the following \ref TNL::Plus, \ref TNL::Multiplies, \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd,
+ * \e Reduction can be one of the following \ref TNL::Plus, \ref TNL::Multiplies, \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd,
  *    \ref TNL::LogicalOr, \ref TNL::BitAnd or \ref TNL::BitOr.
  *
  * \param begin defines range [begin, end) of indexes which will be used for the reduction.
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
  * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation.
+ * \param reduction is a functional defining the reduction operation.
  * \return result of the reduction
  *
  * The `fetch` lambda function takes one argument which is index of the element to be fetched:
@@ -117,18 +117,18 @@ Result reduce( const Index begin,
 template< typename Device,
           typename Index,
           typename Fetch,
-          typename Reduce >
+          typename Reduction >
 auto reduce( const Index begin,
              const Index end,
              Fetch&& fetch,
-             Reduce&& reduce )
+             Reduction&& reduction )
 {
    using Result = decltype( fetch( ( Index ) 0 ) );
    return detail::Reduction< Device >::reduce( begin,
                                                end,
                                                std::forward< Fetch >( fetch ),
-                                               std::forward< Reduce >( reduce ),
-                                               reduce.template getIdempotent< Result >() );
+                                               std::forward< Reduction >( reduction ),
+                                               reduction.template getIdempotent< Result >() );
 }
 
 /**
@@ -141,7 +141,7 @@ auto reduce( const Index begin,
  * \tparam Device parameter says on what device the reduction is gonna be performed.
  * \tparam Index is a type for indexing.
  * \tparam Result is a type of the reduction result.
- * \tparam Reduce is a lambda function performing the reduction.
+ * \tparam Reduction is a lambda function performing the reduction.
  * \tparam Fetch is a lambda function for fetching the input data.
  *
  * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
@@ -149,7 +149,7 @@ auto reduce( const Index begin,
  * \param begin defines range [begin, end) of indexes which will be used for the reduction.
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
  * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+ * \param reduction is a lambda function defining the reduction operation and managing the elements positions.
  * \param zero is the idempotent element for the reduction operation, i.e. element which
  *             does not change the result of the reduction.
  * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
@@ -161,10 +161,10 @@ auto reduce( const Index begin,
  * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
  * ```
  *
- * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ * The `reduction` lambda function takes two variables which are supposed to be reduced:
  *
  * ```
- * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
  * ```
  *
  * \par Example
@@ -179,18 +179,18 @@ template< typename Device,
           typename Index,
           typename Result,
           typename Fetch,
-          typename Reduce >
+          typename Reduction >
 std::pair< Result, Index >
 reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
-                    Reduce&& reduce,
+                    Reduction&& reduction,
                     const Result& zero )
 {
     return detail::Reduction< Device >::reduceWithArgument( begin,
                                                             end,
                                                             std::forward< Fetch >( fetch ),
-                                                            std::forward< Reduce >( reduce ),
+                                                            std::forward< Reduction >( reduction ),
                                                             zero );
 }
 
@@ -204,16 +204,17 @@ reduceWithArgument( const Index begin,
  * \tparam Device parameter says on what device the reduction is gonna be performed.
  * \tparam Index is a type for indexing.
  * \tparam Result is a type of the reduction result.
- * \tparam Reduce is a functional performing the reduction.
+ * \tparam Reduction is a functional performing the reduction.
  * \tparam Fetch is a lambda function for fetching the input data.
  *
  * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
  *
- * \e Reduce can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
+ * \e Reduction can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
+ *
  * \param begin defines range [begin, end) of indexes which will be used for the reduction.
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
  * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+ * \param reduction is a functional defining the reduction operation and managing the element positions.
  * \param zero is the idempotent element for the reduction operation, i.e. element which
  *             does not change the result of the reduction.
  * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
@@ -225,10 +226,10 @@ reduceWithArgument( const Index begin,
  * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
  * ```
  *
- * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ * The `reduction` lambda function takes two variables which are supposed to be reduced:
  *
  * ```
- * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
  * ```
  *
  * \par Example
@@ -242,20 +243,20 @@ reduceWithArgument( const Index begin,
 template< typename Device,
           typename Index,
           typename Fetch,
-          typename Reduce >
+          typename Reduction >
 auto
 reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
-                    Reduce&& reduce )
+                    Reduction&& reduction )
 {
    using Result = decltype( fetch( ( Index ) 0 ) );
    return detail::Reduction< Device >::reduceWithArgument( begin,
                                                            end,
                                                            std::forward< Fetch >( fetch ),
-                                                           std::forward< Reduce >( reduce ),
-                                                           reduce.template getIdempotent< Result >() );
+                                                           std::forward< Reduction >( reduction ),
+                                                           reduction.template getIdempotent< Result >() );
 }
 
-   } // namespace Algorithms
+} // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h
index 65f299120..144750eb5 100644
--- a/src/TNL/Containers/Expressions/Comparison.h
+++ b/src/TNL/Containers/Expressions/Comparison.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Assert.h>
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 
 namespace TNL {
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
index ff094e4ea..704c7f53d 100644
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ b/src/TNL/Containers/Expressions/VerticalOperations.h
@@ -13,7 +13,7 @@
 #include <limits>
 #include <type_traits>
 
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Containers/Expressions/TypeTraits.h>
 
 ////
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 6f701a3ea..21bf6d143 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -10,9 +10,7 @@
 
 #pragma once
 
-#include <functional>
 #include <sstream>
-#include <TNL/Algorithms/Reduction.h>
 #include <TNL/Matrices/SparseMatrix.h>
 
 namespace TNL {
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 02ef757c2..ad2da0d4b 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -12,7 +12,7 @@
 
 #include <functional>
 #include <TNL/Matrices/SparseMatrixView.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Matrices/details/SparseMatrix.h>
 
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 31028036b..a9a5db9ce 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -5,7 +5,7 @@ set( COMMON_TESTS
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
-         ReductionTest
+         reduceTest
          staticForTest
          unrolledForTest
 )
diff --git a/src/UnitTests/Algorithms/ReductionTest.cpp b/src/UnitTests/Algorithms/ReductionTest.cpp
deleted file mode 100644
index 4d630e5f9..000000000
--- a/src/UnitTests/Algorithms/ReductionTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.cu b/src/UnitTests/Algorithms/ReductionTest.cu
deleted file mode 100644
index 4d630e5f9..000000000
--- a/src/UnitTests/Algorithms/ReductionTest.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/reduceTest.cpp b/src/UnitTests/Algorithms/reduceTest.cpp
new file mode 100644
index 000000000..4e9927262
--- /dev/null
+++ b/src/UnitTests/Algorithms/reduceTest.cpp
@@ -0,0 +1 @@
+#include "reduceTest.h"
diff --git a/src/UnitTests/Algorithms/reduceTest.cu b/src/UnitTests/Algorithms/reduceTest.cu
new file mode 100644
index 000000000..4e9927262
--- /dev/null
+++ b/src/UnitTests/Algorithms/reduceTest.cu
@@ -0,0 +1 @@
+#include "reduceTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/reduceTest.h
similarity index 98%
rename from src/UnitTests/Algorithms/ReductionTest.h
rename to src/UnitTests/Algorithms/reduceTest.h
index b880642b8..6b1565d71 100644
--- a/src/UnitTests/Algorithms/ReductionTest.h
+++ b/src/UnitTests/Algorithms/reduceTest.h
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          ReductionTest.h  -  description
+                          reduceTest.h  -  description
                              -------------------
     begin                : Jul 2, 2021
     copyright            : (C) 2021 by Tomas Oberhuber et al.
@@ -13,7 +13,7 @@
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Containers/Array.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 #ifdef HAVE_GTEST
 #include <gtest/gtest.h>
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index ef7d077a5..ceb7ae358 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -17,7 +17,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Math.h>
 
 using Dense_host_float = TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int >;
-- 
GitLab


From dfe6b1e8d1c213b89f12877dd9f60f0da7bf5c81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Jul 2021 21:24:51 +0200
Subject: [PATCH 02/52] Refactoring scan

- used ValueType instead of RealType - closes #87
- replaced prefix-sum with scan in the comments
- renamed variables containing "sum" to "result"
- fixed artificial blockShifts in the sequential implementation
---
 src/TNL/Algorithms/DistributedScan.h       |  26 ++---
 src/TNL/Algorithms/Scan.h                  |  24 ++---
 src/TNL/Algorithms/Scan.hpp                | 108 ++++++++++-----------
 src/TNL/Algorithms/detail/CudaScanKernel.h |  40 ++++----
 4 files changed, 97 insertions(+), 101 deletions(-)

diff --git a/src/TNL/Algorithms/DistributedScan.h b/src/TNL/Algorithms/DistributedScan.h
index aa7c008a7..d6e60949c 100644
--- a/src/TNL/Algorithms/DistributedScan.h
+++ b/src/TNL/Algorithms/DistributedScan.h
@@ -29,9 +29,9 @@ struct DistributedScan
             typename DistributedVector::IndexType begin,
             typename DistributedVector::IndexType end,
             const Reduction& reduction,
-            const typename DistributedVector::RealType zero )
+            const typename DistributedVector::ValueType zero )
    {
-      using RealType = typename DistributedVector::RealType;
+      using ValueType = typename DistributedVector::ValueType;
       using DeviceType = typename DistributedVector::DeviceType;
 
       const auto group = v.getCommunicationGroup();
@@ -43,23 +43,23 @@ struct DistributedScan
 
          // perform first phase on the local data
          auto localView = v.getLocalView();
-         const auto blockShifts = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
-         const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 );
+         const auto block_results = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
+         const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );
 
-         // exchange local sums between ranks
+         // exchange local results between ranks
          const int nproc = MPI::GetSize( group );
-         RealType dataForScatter[ nproc ];
-         for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum;
-         Containers::Vector< RealType, Devices::Host > rankSums( nproc );
+         ValueType dataForScatter[ nproc ];
+         for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = local_result;
+         Containers::Vector< ValueType, Devices::Host > rank_results( nproc );
          // NOTE: exchanging general data types does not work with MPI
-         MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
+         MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );
 
-         // compute the scan of the per-rank sums
-         Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero );
+         // compute the scan of the per-rank results
+         Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, 0, nproc, reduction, zero );
 
-         // perform second phase: shift by the per-block and per-rank offsets
+         // perform the second phase, using the per-block and per-rank results
          const int rank = MPI::GetRank( group );
-         Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] );
+         Scan< DeviceType, Type >::performSecondPhase( localView, block_results, begin, end, reduction, rank_results[ rank ] );
       }
    }
 };
diff --git a/src/TNL/Algorithms/Scan.h b/src/TNL/Algorithms/Scan.h
index 81a5d2f7e..f4dd599cf 100644
--- a/src/TNL/Algorithms/Scan.h
+++ b/src/TNL/Algorithms/Scan.h
@@ -134,7 +134,7 @@ struct Scan< Devices::Sequential, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::RealType zero );
+            const typename Vector::ValueType zero );
 
    template< typename Vector,
              typename Reduction >
@@ -143,7 +143,7 @@ struct Scan< Devices::Sequential, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
-                      const typename Vector::RealType zero );
+                      const typename Vector::ValueType zero );
 
    template< typename Vector,
              typename BlockShifts,
@@ -154,7 +154,7 @@ struct Scan< Devices::Sequential, Type >
                        const typename Vector::IndexType begin,
                        const typename Vector::IndexType end,
                        const Reduction& reduction,
-                       const typename Vector::RealType shift );
+                       const typename Vector::ValueType shift );
 };
 
 template< ScanType Type >
@@ -194,7 +194,7 @@ struct Scan< Devices::Host, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::RealType zero );
+            const typename Vector::ValueType zero );
 
    template< typename Vector,
              typename Reduction >
@@ -203,7 +203,7 @@ struct Scan< Devices::Host, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
-                      const typename Vector::RealType zero );
+                      const typename Vector::ValueType zero );
 
    template< typename Vector,
              typename BlockShifts,
@@ -214,7 +214,7 @@ struct Scan< Devices::Host, Type >
                        const typename Vector::IndexType begin,
                        const typename Vector::IndexType end,
                        const Reduction& reduction,
-                       const typename Vector::RealType shift );
+                       const typename Vector::ValueType shift );
 };
 
 template< ScanType Type >
@@ -254,7 +254,7 @@ struct Scan< Devices::Cuda, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::RealType zero );
+            const typename Vector::ValueType zero );
 
    template< typename Vector,
              typename Reduction >
@@ -263,7 +263,7 @@ struct Scan< Devices::Cuda, Type >
                       const typename Vector::IndexType begin,
                       const typename Vector::IndexType end,
                       const Reduction& reduction,
-                      const typename Vector::RealType zero );
+                      const typename Vector::ValueType zero );
 
    template< typename Vector,
              typename BlockShifts,
@@ -274,7 +274,7 @@ struct Scan< Devices::Cuda, Type >
                        const typename Vector::IndexType begin,
                        const typename Vector::IndexType end,
                        const Reduction& reduction,
-                       const typename Vector::RealType shift );
+                       const typename Vector::ValueType shift );
 };
 
 template< ScanType Type >
@@ -318,7 +318,7 @@ struct SegmentedScan< Devices::Sequential, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::RealType zero );
+            const typename Vector::ValueType zero );
 };
 
 template< ScanType Type >
@@ -362,7 +362,7 @@ struct SegmentedScan< Devices::Host, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::RealType zero );
+            const typename Vector::ValueType zero );
 };
 
 template< ScanType Type >
@@ -408,7 +408,7 @@ struct SegmentedScan< Devices::Cuda, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::RealType zero );
+            const typename Vector::ValueType zero );
 };
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index 78d5eaf60..20e684a27 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -33,9 +33,9 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::RealType zero )
+         const typename Vector::ValueType zero )
 {
-   // sequential prefix-sum does not need a second phase
+   // sequential scan does not need a second phase
    performFirstPhase( v, begin, end, reduction, zero );
 }
 
@@ -48,33 +48,31 @@ performFirstPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
-                   const typename Vector::RealType zero )
+                   const typename Vector::ValueType zero )
 {
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
-   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
-//   return Containers::StaticArray< 1, RealType > block_sums;
-   Containers::Array< RealType, Devices::Host > block_sums( 1 );
-   block_sums[ 0 ] = zero;
-
    if( Type == ScanType::Inclusive ) {
       for( IndexType i = begin + 1; i < end; i++ )
          v[ i ] = reduction( v[ i ], v[ i - 1 ] );
-      block_sums[ 0 ] = v[ end - 1 ];
    }
-   else // Exclusive prefix sum
+   else // Exclusive scan
    {
-      RealType aux = zero;
+      ValueType aux = zero;
       for( IndexType i = begin; i < end; i++ ) {
-         const RealType x = v[ i ];
+         const ValueType x = v[ i ];
          v[ i ] = aux;
          aux = reduction( aux, x );
       }
-      block_sums[ 0 ] = aux;
    }
 
-   return block_sums;
+   // sequential scan = one block, so the exclusive scan is trivially [zero]
+   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
+//   Containers::StaticArray< 1, ValueType > block_results;
+   Containers::Array< ValueType, Devices::Host > block_results( 1 );
+   block_results[ 0 ] = zero;
+   return block_results;
 }
 
 template< ScanType Type >
@@ -88,7 +86,7 @@ performSecondPhase( Vector& v,
                     const typename Vector::IndexType begin,
                     const typename Vector::IndexType end,
                     const Reduction& reduction,
-                    const typename Vector::RealType shift )
+                    const typename Vector::ValueType shift )
 {
    using IndexType = typename Vector::IndexType;
 
@@ -105,7 +103,7 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::RealType zero )
+         const typename Vector::ValueType zero )
 {
 #ifdef HAVE_OPENMP
    if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() >= 2 ) {
@@ -128,50 +126,50 @@ performFirstPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
-                   const typename Vector::RealType zero )
+                   const typename Vector::ValueType zero )
 {
 #ifdef HAVE_OPENMP
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
    const int threads = Devices::Host::getMaxThreadsCount();
-   Containers::Array< RealType > block_sums( threads + 1 );
-   block_sums[ 0 ] = zero;
+   Containers::Array< ValueType > block_results( threads + 1 );
 
    #pragma omp parallel num_threads(threads)
    {
       // init
       const int thread_idx = omp_get_thread_num();
-      RealType block_sum = zero;
+      ValueType block_result = zero;
 
-      // perform prefix-sum on blocks statically assigned to threads
+      // perform scan on blocks statically assigned to threads
       if( Type == ScanType::Inclusive ) {
          #pragma omp for schedule(static)
          for( IndexType i = begin; i < end; i++ ) {
-            block_sum = reduction( block_sum, v[ i ] );
-            v[ i ] = block_sum;
+            block_result = reduction( block_result, v[ i ] );
+            v[ i ] = block_result;
          }
       }
       else {
          #pragma omp for schedule(static)
          for( IndexType i = begin; i < end; i++ ) {
-            const RealType x = v[ i ];
-            v[ i ] = block_sum;
-            block_sum = reduction( block_sum, x );
+            const ValueType x = v[ i ];
+            v[ i ] = block_result;
+            block_result = reduction( block_result, x );
          }
       }
 
-      // write the block sums into the buffer
-      block_sums[ thread_idx + 1 ] = block_sum;
+      // write the block result into the buffer
+      block_results[ thread_idx + 1 ] = block_result;
    }
 
-   // block_sums now contains sums of numbers in each block. The first phase
-   // ends by computing prefix-sum of this array.
+   // block_results now contains scan results for each block. The first phase
+   // ends by computing an exclusive scan of this array.
+   block_results[ 0 ] = zero;
    for( int i = 1; i < threads + 1; i++ )
-      block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] );
+      block_results[ i ] = reduction( block_results[ i ], block_results[ i - 1 ] );
 
-   // block_sums now contains shift values for each block - to be used in the second phase
-   return block_sums;
+   // block_results now contains shift values for each block - to be used in the second phase
+   return block_results;
 #else
    return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
 #endif
@@ -188,10 +186,10 @@ performSecondPhase( Vector& v,
                     const typename Vector::IndexType begin,
                     const typename Vector::IndexType end,
                     const Reduction& reduction,
-                    const typename Vector::RealType shift )
+                    const typename Vector::ValueType shift )
 {
 #ifdef HAVE_OPENMP
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
    const int threads = blockShifts.getSize() - 1;
@@ -200,7 +198,7 @@ performSecondPhase( Vector& v,
    #pragma omp parallel num_threads(threads)
    {
       const int thread_idx = omp_get_thread_num();
-      const RealType offset = reduction( blockShifts[ thread_idx ], shift );
+      const ValueType offset = reduction( blockShifts[ thread_idx ], shift );
 
       // shift intermediate results by the offset
       #pragma omp for schedule(static)
@@ -221,13 +219,13 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::RealType zero )
+         const typename Vector::ValueType zero )
 {
 #ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
-   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
+   detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::perform(
       end - begin,
       &v.getData()[ begin ],  // input
       &v.getData()[ begin ],  // output
@@ -247,13 +245,13 @@ performFirstPhase( Vector& v,
                    const typename Vector::IndexType begin,
                    const typename Vector::IndexType end,
                    const Reduction& reduction,
-                   const typename Vector::RealType zero )
+                   const typename Vector::ValueType zero )
 {
 #ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
-   return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
+   return detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performFirstPhase(
       end - begin,
       &v.getData()[ begin ],  // input
       &v.getData()[ begin ],  // output
@@ -275,13 +273,13 @@ performSecondPhase( Vector& v,
                     const typename Vector::IndexType begin,
                     const typename Vector::IndexType end,
                     const Reduction& reduction,
-                    const typename Vector::RealType shift )
+                    const typename Vector::ValueType shift )
 {
 #ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
-   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
+   detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performSecondPhase(
       end - begin,
       &v.getData()[ begin ],  // output
       blockShifts.getData(),
@@ -304,9 +302,9 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::RealType zero )
+         const typename Vector::ValueType zero )
 {
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
    if( Type == ScanType::Inclusive )
@@ -315,13 +313,13 @@ perform( Vector& v,
          if( ! flags[ i ] )
             v[ i ] = reduction( v[ i ], v[ i - 1 ] );
    }
-   else // Exclusive prefix sum
+   else // Exclusive scan
    {
-       RealType aux( v[ begin ] );
+      ValueType aux( v[ begin ] );
       v[ begin ] = zero;
       for( IndexType i = begin + 1; i < end; i++ )
       {
-         RealType x = v[ i ];
+         ValueType x = v[ i ];
          if( flags[ i ] )
             aux = zero;
          v[ i ] = aux;
@@ -341,7 +339,7 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::RealType zero )
+         const typename Vector::ValueType zero )
 {
 #ifdef HAVE_OPENMP
    // TODO: parallelize with OpenMP
@@ -362,10 +360,10 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::RealType zero )
+         const typename Vector::ValueType zero )
 {
 #ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
+   using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
    throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 63072ea89..21f51fa59 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -34,7 +34,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
                          const int elementsInBlock,
                          const Real* input,
                          Real* output,
-                         Real* auxArray )
+                         Real* blockResults )
 {
    Real* sharedData = TNL::Cuda::getSharedMemory< Real >();
    Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ];
@@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
    {
       if( scanType == ScanType::Exclusive )
       {
-         auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
-                                             sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
+         blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
+                                                 sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
       }
       else
-         auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
+         blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
    }
 }
 
@@ -164,12 +164,12 @@ cudaSecondPhaseBlockScan( Reduction reduction,
                           const int elementsInBlock,
                           const Index gridIdx,
                           const Index maxGridSize,
-                          const Real* auxArray,
+                          const Real* blockResults,
                           Real* data,
                           Real shift )
 {
    if( gridIdx > 0 || blockIdx.x > 0 )
-      shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
+      shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
    const int readOffset = blockIdx.x * elementsInBlock;
    int readIdx = threadIdx.x;
    while( readIdx < elementsInBlock && readOffset + readIdx < size )
@@ -248,9 +248,9 @@ struct CudaScanKernelLauncher
       const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
       //std::cerr << "numberOfgrids =  " << numberOfGrids << std::endl;
 
-      // allocate array for the block sums
-      Containers::Array< Real, Devices::Cuda > blockSums;
-      blockSums.setSize( numberOfBlocks );
+      // allocate array for the block results
+      Containers::Array< Real, Devices::Cuda > blockResults;
+      blockResults.setSize( numberOfBlocks );
 
       // loop over all grids
       for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
@@ -278,20 +278,20 @@ struct CudaScanKernelLauncher
               elementsInBlock,
               &deviceInput[ gridOffset ],
               &deviceOutput[ gridOffset ],
-              &blockSums.getData()[ gridIdx * maxGridSize() ] );
+              &blockResults.getData()[ gridIdx * maxGridSize() ] );
       }
 
       // synchronize the null-stream after all grids
       cudaStreamSynchronize(0);
       TNL_CHECK_CUDA_DEVICE;
 
-      // blockSums now contains sums of numbers in each block. The first phase
-      // ends by computing prefix-sum of this array.
+      // blockResults now contains scan results for each block. The first phase
+      // ends by computing an exclusive scan of this array.
       if( numberOfBlocks > 1 ) {
          CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform(
-            blockSums.getSize(),
-            blockSums.getData(),
-            blockSums.getData(),
+            blockResults.getSize(),
+            blockResults.getData(),
+            blockResults.getData(),
             reduction,
             zero,
             blockSize );
@@ -301,8 +301,8 @@ struct CudaScanKernelLauncher
       // to check if we test the algorithm with more than one CUDA grid.
       gridsCount() = numberOfGrids;
 
-      // blockSums now contains shift values for each block - to be used in the second phase
-      return blockSums;
+      // blockResults now contains shift values for each block - to be used in the second phase
+      return blockResults;
    }
 
    /****
@@ -363,10 +363,8 @@ struct CudaScanKernelLauncher
       TNL_CHECK_CUDA_DEVICE;
    }
 
-   /****
-    * The following serves for setting smaller maxGridSize so that we can force
-    * the prefix sum in CUDA to run with more the one grids in unit tests.
-    */
+   // The following serves for setting smaller maxGridSize so that we can force
+   // the scan in CUDA to run with more than one grid in unit tests.
    static int& maxGridSize()
    {
       static int maxGridSize = Cuda::getMaxGridSize();
-- 
GitLab


From ee8e4e92d03f31f3a7b0199af763ae03f2c8783d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 08:25:51 +0200
Subject: [PATCH 03/52] Fixed sequential scan to apply the initial value
 properly

---
 src/TNL/Algorithms/Scan.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index 20e684a27..8ccdadd6f 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -53,13 +53,13 @@ performFirstPhase( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   ValueType aux = zero;
    if( Type == ScanType::Inclusive ) {
-      for( IndexType i = begin + 1; i < end; i++ )
-         v[ i ] = reduction( v[ i ], v[ i - 1 ] );
+      for( IndexType i = begin; i < end; i++ )
+         v[ i ] = aux = reduction( aux, v[ i ] );
    }
    else // Exclusive scan
    {
-      ValueType aux = zero;
       for( IndexType i = begin; i < end; i++ ) {
          const ValueType x = v[ i ];
          v[ i ] = aux;
-- 
GitLab


From 311fcf3632726f0d7c82f89fe1d1b7a32f0fd6dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 12:52:30 +0200
Subject: [PATCH 04/52] Refactored splitting of the scan operation in two
 phases

- sequential scan does not need to be split: "perform" runs the whole
  simple scan algorithm, "performFirstPhase" only reduces the block
  (i.e. the whole vector), and "performSecondPhase" runs the scan using
  the global offset combined with the block's shift (the first element
  of the block results array) as the initial value
- parallel OpenMP scan calls the sequential scan to process the block
  results
- parallel CUDA scan was changed such that the block results array is an
  exclusive scan after the first phase, same as in the other device
  specializations
---
 src/TNL/Algorithms/Scan.h                  |  6 +--
 src/TNL/Algorithms/Scan.hpp                | 60 ++++++++++------------
 src/TNL/Algorithms/detail/CudaScanKernel.h | 11 ++--
 3 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/src/TNL/Algorithms/Scan.h b/src/TNL/Algorithms/Scan.h
index f4dd599cf..4307aee65 100644
--- a/src/TNL/Algorithms/Scan.h
+++ b/src/TNL/Algorithms/Scan.h
@@ -154,7 +154,7 @@ struct Scan< Devices::Sequential, Type >
                        const typename Vector::IndexType begin,
                        const typename Vector::IndexType end,
                        const Reduction& reduction,
-                       const typename Vector::ValueType shift );
+                       const typename Vector::ValueType zero );
 };
 
 template< ScanType Type >
@@ -214,7 +214,7 @@ struct Scan< Devices::Host, Type >
                        const typename Vector::IndexType begin,
                        const typename Vector::IndexType end,
                        const Reduction& reduction,
-                       const typename Vector::ValueType shift );
+                       const typename Vector::ValueType zero );
 };
 
 template< ScanType Type >
@@ -274,7 +274,7 @@ struct Scan< Devices::Cuda, Type >
                        const typename Vector::IndexType begin,
                        const typename Vector::IndexType end,
                        const Reduction& reduction,
-                       const typename Vector::ValueType shift );
+                       const typename Vector::ValueType zero );
 };
 
 template< ScanType Type >
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index 8ccdadd6f..54780b48f 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -13,6 +13,7 @@
 #pragma once
 
 #include "Scan.h"
+#include "reduce.h"
 
 #include <TNL/Assert.h>
 #include <TNL/Containers/Array.h>
@@ -34,25 +35,11 @@ perform( Vector& v,
          const typename Vector::IndexType end,
          const Reduction& reduction,
          const typename Vector::ValueType zero )
-{
-   // sequential scan does not need a second phase
-   performFirstPhase( v, begin, end, reduction, zero );
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction >
-auto
-Scan< Devices::Sequential, Type >::
-performFirstPhase( Vector& v,
-                   const typename Vector::IndexType begin,
-                   const typename Vector::IndexType end,
-                   const Reduction& reduction,
-                   const typename Vector::ValueType zero )
 {
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   // simple sequential algorithm - not split into phases
    ValueType aux = zero;
    if( Type == ScanType::Inclusive ) {
       for( IndexType i = begin; i < end; i++ )
@@ -66,12 +53,25 @@ performFirstPhase( Vector& v,
          aux = reduction( aux, x );
       }
    }
+}
 
-   // sequential scan = one block, so the exclusive scan is trivially [zero]
+template< ScanType Type >
+   template< typename Vector,
+             typename Reduction >
+auto
+Scan< Devices::Sequential, Type >::
+performFirstPhase( Vector& v,
+                   const typename Vector::IndexType begin,
+                   const typename Vector::IndexType end,
+                   const Reduction& reduction,
+                   const typename Vector::ValueType zero )
+{
    // FIXME: StaticArray does not have getElement() which is used in DistributedScan
-//   Containers::StaticArray< 1, ValueType > block_results;
-   Containers::Array< ValueType, Devices::Host > block_results( 1 );
+//   Containers::StaticArray< 2, ValueType > block_results;
+   Containers::Array< typename Vector::ValueType, Devices::Sequential > block_results( 2 );
+   // artificial first phase - only reduce the block
    block_results[ 0 ] = zero;
+   block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, v, reduction, zero );
    return block_results;
 }
 
@@ -86,12 +86,10 @@ performSecondPhase( Vector& v,
                     const typename Vector::IndexType begin,
                     const typename Vector::IndexType end,
                     const Reduction& reduction,
-                    const typename Vector::ValueType shift )
+                    const typename Vector::ValueType zero )
 {
-   using IndexType = typename Vector::IndexType;
-
-   for( IndexType i = begin; i < end; i++ )
-      v[ i ] = reduction( v[ i ], shift );
+   // artificial second phase - only one block, use the shift as the initial value
+   perform( v, begin, end, reduction, reduction( zero, blockShifts[ 0 ] ) );
 }
 
 template< ScanType Type >
@@ -159,14 +157,12 @@ performFirstPhase( Vector& v,
       }
 
       // write the block result into the buffer
-      block_results[ thread_idx + 1 ] = block_result;
+      block_results[ thread_idx ] = block_result;
    }
 
    // block_results now contains scan results for each block. The first phase
    // ends by computing an exclusive scan of this array.
-   block_results[ 0 ] = zero;
-   for( int i = 1; i < threads + 1; i++ )
-      block_results[ i ] = reduction( block_results[ i ], block_results[ i - 1 ] );
+   Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, 0, threads + 1, reduction, zero );
 
    // block_results now contains shift values for each block - to be used in the second phase
    return block_results;
@@ -186,7 +182,7 @@ performSecondPhase( Vector& v,
                     const typename Vector::IndexType begin,
                     const typename Vector::IndexType end,
                     const Reduction& reduction,
-                    const typename Vector::ValueType shift )
+                    const typename Vector::ValueType zero )
 {
 #ifdef HAVE_OPENMP
    using ValueType = typename Vector::ValueType;
@@ -198,7 +194,7 @@ performSecondPhase( Vector& v,
    #pragma omp parallel num_threads(threads)
    {
       const int thread_idx = omp_get_thread_num();
-      const ValueType offset = reduction( blockShifts[ thread_idx ], shift );
+      const ValueType offset = reduction( zero, blockShifts[ thread_idx ] );
 
       // shift intermediate results by the offset
       #pragma omp for schedule(static)
@@ -206,7 +202,7 @@ performSecondPhase( Vector& v,
          v[ i ] = reduction( v[ i ], offset );
    }
 #else
-   Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, shift );
+   Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, zero );
 #endif
 }
 
@@ -273,7 +269,7 @@ performSecondPhase( Vector& v,
                     const typename Vector::IndexType begin,
                     const typename Vector::IndexType end,
                     const Reduction& reduction,
-                    const typename Vector::ValueType shift )
+                    const typename Vector::ValueType zero )
 {
 #ifdef HAVE_CUDA
    using ValueType = typename Vector::ValueType;
@@ -284,7 +280,7 @@ performSecondPhase( Vector& v,
       &v.getData()[ begin ],  // output
       blockShifts.getData(),
       reduction,
-      shift );
+      zero );
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 21f51fa59..ddce0c6ea 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -168,8 +168,7 @@ cudaSecondPhaseBlockScan( Reduction reduction,
                           Real* data,
                           Real shift )
 {
-   if( gridIdx > 0 || blockIdx.x > 0 )
-      shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
+   shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x ] );
    const int readOffset = blockIdx.x * elementsInBlock;
    int readIdx = threadIdx.x;
    while( readIdx < elementsInBlock && readOffset + readIdx < size )
@@ -250,7 +249,8 @@ struct CudaScanKernelLauncher
 
       // allocate array for the block results
       Containers::Array< Real, Devices::Cuda > blockResults;
-      blockResults.setSize( numberOfBlocks );
+      blockResults.setSize( numberOfBlocks + 1 );
+      blockResults.setElement( 0, zero );
 
       // loop over all grids
       for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
@@ -278,7 +278,8 @@ struct CudaScanKernelLauncher
               elementsInBlock,
               &deviceInput[ gridOffset ],
               &deviceOutput[ gridOffset ],
-              &blockResults.getData()[ gridIdx * maxGridSize() ] );
+              // blockResults are shifted by 1, because the 0-th element should stay zero
+              &blockResults.getData()[ gridIdx * maxGridSize() + 1 ] );
       }
 
       // synchronize the null-stream after all grids
@@ -288,6 +289,8 @@ struct CudaScanKernelLauncher
       // blockResults now contains scan results for each block. The first phase
       // ends by computing an exclusive scan of this array.
       if( numberOfBlocks > 1 ) {
+         // we perform an inclusive scan, but the 0-th element is zero and the
+         // block results were shifted by 1, so effectively we get an exclusive scan
          CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform(
             blockResults.getSize(),
             blockResults.getData(),
-- 
GitLab


From 62100711058b7cc69559ecc45d2bf6124b5390ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 9 Jul 2021 22:11:54 +0200
Subject: [PATCH 05/52] Removed useless SharedPointer and ParallelFor from
 ArrayTest

The tests should not rely on other parts of the library if possible.
---
 src/UnitTests/Containers/ArrayTest.h | 50 ++++++++++++++++++----------
 1 file changed, 33 insertions(+), 17 deletions(-)

diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index 1ed8052ee..15fb055a1 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -16,9 +16,6 @@
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Pointers/DevicePointer.h>
-#include <TNL/Pointers/SharedPointer.h>
-#include <TNL/Pointers/SmartPointersRegister.h>
-#include <TNL/Algorithms/ParallelFor.h>
 
 #include "gtest/gtest.h"
 
@@ -405,21 +402,40 @@ TYPED_TEST( ArrayTest, elementwiseAccess )
    testArrayElementwiseAccess( ArrayType() );
 }
 
-template< typename ArrayType >
-void test_setElement()
+template< typename Value, typename Index >
+void test_setElement_on_device( const Array< Value, Devices::Host, Index >& )
 {
-   Pointers::SharedPointer< ArrayType > a( 10, 0 ), b( 10, 0 );
-   auto set = [=] __cuda_callable__ ( int i ) mutable {
-      a->setElement( i, i );
-      b->setElement( i, a->getElement( i ) );
-   };
-   Pointers::synchronizeSmartPointersOnDevice< typename ArrayType::DeviceType >();
-   Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, set );
-   for( int i = 0; i < 10; i++ )
-   {
-      EXPECT_EQ( a->getElement( i ), i );
-      EXPECT_EQ( b->getElement( i ), i );
+}
+
+#ifdef HAVE_CUDA
+template< typename ValueType, typename IndexType >
+__global__ void test_setElement_on_device_kernel( Array< ValueType, Devices::Cuda, IndexType >* a,
+                                                  Array< ValueType, Devices::Cuda, IndexType >* b )
+{
+   if( threadIdx.x < a->getSize() ) {
+      a->setElement( threadIdx.x, threadIdx.x );
+      b->setElement( threadIdx.x, a->getElement( threadIdx.x ) );
+   }
+}
+#endif /* HAVE_CUDA */
+
+template< typename Value, typename Index >
+void test_setElement_on_device( const Array< Value, Devices::Cuda, Index >& )
+{
+#ifdef HAVE_CUDA
+   using ArrayType = Array< Value, Devices::Cuda, Index >;
+   ArrayType a( 10, 0 ), b( 10, 0 );
+   Pointers::DevicePointer< ArrayType > kernel_a( a );
+   Pointers::DevicePointer< ArrayType > kernel_b( b );
+   test_setElement_on_device_kernel<<< 1, 16 >>>( &kernel_a.template modifyData< Devices::Cuda >(),
+                                                  &kernel_b.template modifyData< Devices::Cuda >() );
+   cudaDeviceSynchronize();
+   TNL_CHECK_CUDA_DEVICE;
+   for( int i = 0; i < 10; i++ ) {
+      EXPECT_EQ( a.getElement( i ), i );
+      EXPECT_EQ( b.getElement( i ), i );
    }
+#endif
 }
 
 TYPED_TEST( ArrayTest, setElement )
@@ -433,7 +449,7 @@ TYPED_TEST( ArrayTest, setElement )
    for( int i = 0; i < 10; i++ )
       EXPECT_EQ( a.getElement( i ), i );
 
-   test_setElement< ArrayType >();
+   test_setElement_on_device( a );
 }
 
 // test must be in a plain function because nvcc sucks (extended lambdas are
-- 
GitLab


From 1c9ff705cf0001c0492cabd421ca0bd7bd906b59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 08:49:50 +0200
Subject: [PATCH 06/52] Added Devices::Sequential to ArrayTest and
 ArrayViewTest

---
 src/UnitTests/Containers/ArrayTest.h     | 36 +++++++++++++++++++++++
 src/UnitTests/Containers/ArrayViewTest.h | 37 ++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index 15fb055a1..feae975e4 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -64,6 +64,23 @@ protected:
 // types for which ArrayTest is instantiated
 using ArrayTypes = ::testing::Types<
 #ifndef HAVE_CUDA
+   // we can't test all types because the argument list would be too long...
+//   Array< int,    Devices::Sequential, short >,
+//   Array< long,   Devices::Sequential, short >,
+//   Array< float,  Devices::Sequential, short >,
+//   Array< double, Devices::Sequential, short >,
+//   Array< MyData, Devices::Sequential, short >,
+//   Array< int,    Devices::Sequential, int >,
+//   Array< long,   Devices::Sequential, int >,
+//   Array< float,  Devices::Sequential, int >,
+//   Array< double, Devices::Sequential, int >,
+//   Array< MyData, Devices::Sequential, int >,
+   Array< int,    Devices::Sequential, long >,
+   Array< long,   Devices::Sequential, long >,
+   Array< float,  Devices::Sequential, long >,
+   Array< double, Devices::Sequential, long >,
+   Array< MyData, Devices::Sequential, long >,
+
    Array< int,    Devices::Host, short >,
    Array< long,   Devices::Host, short >,
    Array< float,  Devices::Host, short >,
@@ -102,6 +119,8 @@ using ArrayTypes = ::testing::Types<
    // (but we can't test all types because the argument list would be too long...)
 #ifndef HAVE_CUDA
    ,
+   Vector< float,  Devices::Sequential, long >,
+   Vector< double, Devices::Sequential, long >,
    Vector< float,  Devices::Host, long >,
    Vector< double, Devices::Host, long >
 #endif
@@ -358,6 +377,18 @@ TYPED_TEST( ArrayTest, reset )
    EXPECT_EQ( u.getData(), nullptr );
 }
 
+template< typename Value, typename Index >
+void testArrayElementwiseAccess( Array< Value, Devices::Sequential, Index >&& u )
+{
+   u.setSize( 10 );
+   for( int i = 0; i < 10; i++ ) {
+      u.setElement( i, i );
+      EXPECT_EQ( u.getData()[ i ], i );
+      EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( u[ i ], i );
+   }
+}
+
 template< typename Value, typename Index >
 void testArrayElementwiseAccess( Array< Value, Devices::Host, Index >&& u )
 {
@@ -402,6 +433,11 @@ TYPED_TEST( ArrayTest, elementwiseAccess )
    testArrayElementwiseAccess( ArrayType() );
 }
 
+template< typename Value, typename Index >
+void test_setElement_on_device( const Array< Value, Devices::Sequential, Index >& )
+{
+}
+
 template< typename Value, typename Index >
 void test_setElement_on_device( const Array< Value, Devices::Host, Index >& )
 {
diff --git a/src/UnitTests/Containers/ArrayViewTest.h b/src/UnitTests/Containers/ArrayViewTest.h
index d620b8bbb..48274181e 100644
--- a/src/UnitTests/Containers/ArrayViewTest.h
+++ b/src/UnitTests/Containers/ArrayViewTest.h
@@ -60,7 +60,24 @@ protected:
 // types for which ArrayViewTest is instantiated
 using ViewTypes = ::testing::Types<
 #ifndef HAVE_CUDA
-    ArrayView< int,    Devices::Host, short >
+   // we can't test all types because the argument list would be too long...
+//    ArrayView< int,    Devices::Sequential, short >
+//   ,ArrayView< long,   Devices::Sequential, short >
+//   ,ArrayView< float,  Devices::Sequential, short >
+//   ,ArrayView< double, Devices::Sequential, short >
+//   ,ArrayView< MyData, Devices::Sequential, short >
+//   ,ArrayView< int,    Devices::Sequential, int >
+//   ,ArrayView< long,   Devices::Sequential, int >
+//   ,ArrayView< float,  Devices::Sequential, int >
+//   ,ArrayView< double, Devices::Sequential, int >
+//   ,ArrayView< MyData, Devices::Sequential, int >
+    ArrayView< int,    Devices::Sequential, long >
+   ,ArrayView< long,   Devices::Sequential, long >
+   ,ArrayView< float,  Devices::Sequential, long >
+   ,ArrayView< double, Devices::Sequential, long >
+   ,ArrayView< MyData, Devices::Sequential, long >
+
+   ,ArrayView< int,    Devices::Host, short >
    ,ArrayView< long,   Devices::Host, short >
    ,ArrayView< float,  Devices::Host, short >
    ,ArrayView< double, Devices::Host, short >
@@ -98,6 +115,8 @@ using ViewTypes = ::testing::Types<
    // (but we can't test all types because the argument list would be too long...)
 #ifndef HAVE_CUDA
    ,
+   VectorView< float,  Devices::Sequential, long >,
+   VectorView< double, Devices::Sequential, long >,
    VectorView< float,  Devices::Host, long >,
    VectorView< double, Devices::Host, long >
 #endif
@@ -218,6 +237,20 @@ TYPED_TEST( ArrayViewTest, reset )
    EXPECT_EQ( u.getData(), nullptr );
 }
 
+template< typename Value, typename Index >
+void testArrayViewElementwiseAccess( Array< Value, Devices::Sequential, Index >&& a )
+{
+   a.setSize( 10 );
+   using ViewType = ArrayView< Value, Devices::Sequential, Index >;
+   ViewType u( a );
+   for( int i = 0; i < 10; i++ ) {
+      u.setElement( i, i );
+      EXPECT_EQ( u.getData()[ i ], i );
+      EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( u[ i ], i );
+   }
+}
+
 template< typename Value, typename Index >
 void testArrayViewElementwiseAccess( Array< Value, Devices::Host, Index >&& a )
 {
@@ -274,7 +307,7 @@ void ArrayViewEvaluateTest( ArrayType& u )
    ViewType v( u );
 
    v.forAllElements( [] __cuda_callable__ ( IndexType i, ValueType& value ) { value = 3 * i % 4; } );
-   
+
    for( int i = 0; i < 10; i++ )
    {
       EXPECT_EQ( u.getElement( i ), 3 * i % 4 );
-- 
GitLab


From 045241d7689ddf39dc2d9aee00728452bdd73833 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 08:29:57 +0200
Subject: [PATCH 07/52] Added Devices::Sequential to DistributedVectorTest and
 VectorTestSetup

---
 .../Containers/DistributedVectorTest.h        |  1 +
 src/UnitTests/Containers/VectorTestSetup.h    | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
index 8dc9d6d26..24da5fbe7 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Containers/DistributedVectorTest.h
@@ -77,6 +77,7 @@ protected:
 
 // types for which DistributedVectorTest is instantiated
 using DistributedVectorTypes = ::testing::Types<
+   DistributedVector< double, Devices::Sequential, int >,
    DistributedVector< double, Devices::Host, int >
 #ifdef HAVE_CUDA
    ,
diff --git a/src/UnitTests/Containers/VectorTestSetup.h b/src/UnitTests/Containers/VectorTestSetup.h
index c9863009b..7141466b8 100644
--- a/src/UnitTests/Containers/VectorTestSetup.h
+++ b/src/UnitTests/Containers/VectorTestSetup.h
@@ -37,6 +37,25 @@ protected:
 // TODO: Quad must be fixed
 using VectorTypes = ::testing::Types<
 #ifndef HAVE_CUDA
+   Vector< int,            Devices::Sequential, short >,
+   Vector< long,           Devices::Sequential, short >,
+   Vector< float,          Devices::Sequential, short >,
+   Vector< double,         Devices::Sequential, short >,
+   //Vector< Quad< float >,  Devices::Sequential, short >,
+   //Vector< Quad< double >, Devices::Sequential, short >,
+   Vector< int,            Devices::Sequential, int >,
+   Vector< long,           Devices::Sequential, int >,
+   Vector< float,          Devices::Sequential, int >,
+   Vector< double,         Devices::Sequential, int >,
+   //Vector< Quad< float >,  Devices::Sequential, int >,
+   //Vector< Quad< double >, Devices::Sequential, int >,
+   Vector< int,            Devices::Sequential, long >,
+   Vector< long,           Devices::Sequential, long >,
+   Vector< float,          Devices::Sequential, long >,
+   Vector< double,         Devices::Sequential, long >,
+   //Vector< Quad< float >,  Devices::Sequential, long >,
+   //Vector< Quad< double >, Devices::Sequential, long >,
+
    Vector< int,            Devices::Host, short >,
    Vector< long,           Devices::Host, short >,
    Vector< float,          Devices::Host, short >,
-- 
GitLab


From 090a8f2980e623ae07b01f9d544979073b866a12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 09:07:38 +0200
Subject: [PATCH 08/52] Added operator() to StaticArray, Array, ArrayView and
 ExpressionTemplates

Hence, all StaticArray, Array, ArrayView and even expression templates are
directly usable in reduction without the need to create a wrapping fetch
functor. Also NDArray has this interface in 1D.
---
 src/TNL/Containers/Array.h                    | 14 ++++++++++
 src/TNL/Containers/Array.hpp                  | 26 ++++++++++++++++++-
 src/TNL/Containers/ArrayView.h                | 14 ++++++++++
 src/TNL/Containers/ArrayView.hpp              | 24 ++++++++++++++++-
 .../Expressions/ExpressionTemplates.h         | 24 +++++++++++++++++
 src/TNL/Containers/StaticArray.h              | 16 ++++++++++++
 src/TNL/Containers/StaticArray.hpp            | 15 +++++++++++
 src/UnitTests/Containers/ArrayTest.h          | 17 +++++++-----
 src/UnitTests/Containers/ArrayViewTest.h      | 19 +++++++++-----
 src/UnitTests/Containers/StaticArrayTest.cpp  | 12 +++++++++
 10 files changed, 166 insertions(+), 15 deletions(-)

diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index f2c9ca705..3bbd5efb0 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -498,6 +498,20 @@ class Array
        */
       __cuda_callable__ const Value& operator[]( IndexType i ) const;
 
+      /**
+       * \brief Accesses the \e i-th element of the array.
+       *
+       * Equivalent to \ref operator[], with the same notes and caveats.
+       */
+      __cuda_callable__ Value& operator()( IndexType i );
+
+      /**
+       * \brief Accesses the \e i-th element of the array.
+       *
+       * Equivalent to \ref operator[], with the same notes and caveats.
+       */
+      __cuda_callable__ const Value& operator()( IndexType i ) const;
+
       /**
        * \brief Copy-assignment operator for copying data from another array.
        *
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index d935840ff..fc3e7193b 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          Array_impl.h  -  description
+                          Array.hpp  -  description
                              -------------------
     begin                : Nov 8, 2012
     copyright            : (C) 2012 by Tomas Oberhuber
@@ -555,6 +555,30 @@ operator[]( IndexType i ) const
    return this->data[ i ];
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+__cuda_callable__
+Value&
+Array< Value, Device, Index, Allocator >::
+operator()( IndexType i )
+{
+   return operator[]( i );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+__cuda_callable__
+const Value&
+Array< Value, Device, Index, Allocator >::
+operator()( IndexType i ) const
+{
+   return operator[]( i );
+}
+
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 31743c1f6..3716ce01e 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -370,6 +370,20 @@ public:
    __cuda_callable__
    const Value& operator[]( IndexType i ) const;
 
+   /**
+    * \brief Accesses the \e i-th element of the array.
+    *
+    * Equivalent to \ref operator[], with the same notes and caveats.
+    */
+   __cuda_callable__ Value& operator()( IndexType i );
+
+   /**
+    * \brief Accesses the \e i-th element of the array.
+    *
+    * Equivalent to \ref operator[], with the same notes and caveats.
+    */
+   __cuda_callable__ const Value& operator()( IndexType i ) const;
+
    /**
     * \brief Compares the array view with another array-like container.
     *
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index 8f6b446fe..fd9f95297 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          ArrayView_impl.h  -  description
+                          ArrayView.hpp  -  description
                              -------------------
     begin                : Sep 1, 2018
     copyright            : (C) 2018 by Tomas Oberhuber et al.
@@ -273,6 +273,28 @@ operator[]( IndexType i ) const
    return data[ i ];
 }
 
+template< typename Value,
+          typename Device,
+          typename Index >
+__cuda_callable__
+Value&
+ArrayView< Value, Device, Index >::
+operator()( IndexType i )
+{
+   return operator[]( i );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+__cuda_callable__
+const Value&
+ArrayView< Value, Device, Index >::
+operator()( IndexType i ) const
+{
+   return operator[]( i );
+}
+
 template< typename Value,
           typename Device,
           typename Index >
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 11b06e822..29ea9f013 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -88,6 +88,12 @@ struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, Ve
       return Operation::evaluate( op1[ i ], op2[ i ] );
    }
 
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
+   }
+
    __cuda_callable__
    IndexType getSize() const
    {
@@ -131,6 +137,12 @@ struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, Ar
       return Operation::evaluate( op1[ i ], op2 );
    }
 
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
+   }
+
    __cuda_callable__
    IndexType getSize() const
    {
@@ -174,6 +186,12 @@ struct BinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorEx
       return Operation::evaluate( op1, op2[ i ] );
    }
 
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
+   }
+
    __cuda_callable__
    IndexType getSize() const
    {
@@ -218,6 +236,12 @@ struct UnaryExpressionTemplate
       return Operation::evaluate( operand[ i ] );
    }
 
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
+   }
+
    __cuda_callable__
    IndexType getSize() const
    {
diff --git a/src/TNL/Containers/StaticArray.h b/src/TNL/Containers/StaticArray.h
index 4f7f753c2..ba5f65ed2 100644
--- a/src/TNL/Containers/StaticArray.h
+++ b/src/TNL/Containers/StaticArray.h
@@ -135,6 +135,22 @@ public:
    __cuda_callable__
    Value& operator[]( int i );
 
+   /**
+    * \brief Accesses specified element at the position \e i and returns a constant reference to its value.
+    *
+    * Equivalent to \ref operator[].
+    */
+   __cuda_callable__
+   const Value& operator()( int i ) const;
+
+   /**
+    * \brief Accesses specified element at the position \e i and returns a reference to its value.
+    *
+    * Equivalent to \ref operator[].
+    */
+   __cuda_callable__
+   Value& operator()( int i );
+
    /**
     * \brief Returns reference to the first coordinate.
     */
diff --git a/src/TNL/Containers/StaticArray.hpp b/src/TNL/Containers/StaticArray.hpp
index c6c18fb0b..d356c9c3a 100644
--- a/src/TNL/Containers/StaticArray.hpp
+++ b/src/TNL/Containers/StaticArray.hpp
@@ -190,6 +190,21 @@ Value& StaticArray< Size, Value >::operator[]( int i )
    TNL_ASSERT_LT( i, Size, "Element index is out of bounds." );
    return data[ i ];
 }
+
+template< int Size, typename Value >
+__cuda_callable__
+const Value& StaticArray< Size, Value >::operator()( int i ) const
+{
+   return operator[]( i );
+}
+
+template< int Size, typename Value >
+__cuda_callable__
+Value& StaticArray< Size, Value >::operator()( int i )
+{
+   return operator[]( i );
+}
+
 template< int Size, typename Value >
 __cuda_callable__
 Value& StaticArray< Size, Value >::x()
diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index feae975e4..fe7dd55e4 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -386,6 +386,7 @@ void testArrayElementwiseAccess( Array< Value, Devices::Sequential, Index >&& u
       EXPECT_EQ( u.getData()[ i ], i );
       EXPECT_EQ( u.getElement( i ), i );
       EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
    }
 }
 
@@ -398,15 +399,17 @@ void testArrayElementwiseAccess( Array< Value, Devices::Host, Index >&& u )
       EXPECT_EQ( u.getData()[ i ], i );
       EXPECT_EQ( u.getElement( i ), i );
       EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
    }
 }
 
 #ifdef HAVE_CUDA
 template< typename ValueType, typename IndexType >
-__global__ void testSetGetElementKernel( Array< ValueType, Devices::Cuda, IndexType >* u )
+__global__ void testSetGetElementKernel( Array< ValueType, Devices::Cuda, IndexType >* u,
+                                         Array< ValueType, Devices::Cuda, IndexType >* v )
 {
-   if( threadIdx.x < ( *u ).getSize() )
-      ( *u )[ threadIdx.x ] = threadIdx.x;
+   if( threadIdx.x < u->getSize() )
+      ( *u )[ threadIdx.x ] = ( *v )( threadIdx.x ) = threadIdx.x;
 }
 #endif /* HAVE_CUDA */
 
@@ -414,14 +417,16 @@ template< typename Value, typename Index >
 void testArrayElementwiseAccess( Array< Value, Devices::Cuda, Index >&& u )
 {
 #ifdef HAVE_CUDA
-   u.setSize( 10 );
    using ArrayType = Array< Value, Devices::Cuda, Index >;
-   Pointers::DevicePointer< ArrayType > kernel_u( u );
-   testSetGetElementKernel<<< 1, 16 >>>( &kernel_u.template modifyData< Devices::Cuda >() );
+   u.setSize( 10 );
+   ArrayType v( 10 );
+   Pointers::DevicePointer< ArrayType > kernel_u( u ), kernel_v( v );
+   testSetGetElementKernel<<< 1, 16 >>>( &kernel_u.template modifyData< Devices::Cuda >(), &kernel_v.template modifyData< Devices::Cuda >() );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    for( int i = 0; i < 10; i++ ) {
       EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( v.getElement( i ), i );
    }
 #endif
 }
diff --git a/src/UnitTests/Containers/ArrayViewTest.h b/src/UnitTests/Containers/ArrayViewTest.h
index 48274181e..8b9e8157b 100644
--- a/src/UnitTests/Containers/ArrayViewTest.h
+++ b/src/UnitTests/Containers/ArrayViewTest.h
@@ -248,6 +248,7 @@ void testArrayViewElementwiseAccess( Array< Value, Devices::Sequential, Index >&
       EXPECT_EQ( u.getData()[ i ], i );
       EXPECT_EQ( u.getElement( i ), i );
       EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
    }
 }
 
@@ -262,30 +263,34 @@ void testArrayViewElementwiseAccess( Array< Value, Devices::Host, Index >&& a )
       EXPECT_EQ( u.getData()[ i ], i );
       EXPECT_EQ( u.getElement( i ), i );
       EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
    }
 }
 
 #ifdef HAVE_CUDA
 template< typename ValueType, typename IndexType >
-__global__ void testSetGetElementKernel( ArrayView< ValueType, Devices::Cuda, IndexType > v )
+__global__ void testSetGetElementKernel( ArrayView< ValueType, Devices::Cuda, IndexType > u,
+                                         ArrayView< ValueType, Devices::Cuda, IndexType > v )
 {
    if( threadIdx.x < v.getSize() )
-      v[ threadIdx.x ] = threadIdx.x;
+      u[ threadIdx.x ] = v( threadIdx.x ) = threadIdx.x;
 }
 #endif // HAVE_CUDA
 
 template< typename Value, typename Index >
-void testArrayViewElementwiseAccess( Array< Value, Devices::Cuda, Index >&& u )
+void testArrayViewElementwiseAccess( Array< Value, Devices::Cuda, Index >&& a )
 {
 #ifdef HAVE_CUDA
-   u.setSize( 10 );
    using ArrayType = Array< Value, Devices::Cuda, Index >;
    using ViewType = ArrayView< Value, Devices::Cuda, Index >;
-   ViewType v( u );
-   testSetGetElementKernel<<< 1, 16 >>>( v );
+   a.setSize( 10 );
+   ArrayType b( 10 );
+   ViewType u( a ), v( b );
+   testSetGetElementKernel<<< 1, 16 >>>( u, v );
    TNL_CHECK_CUDA_DEVICE;
    for( int i = 0; i < 10; i++ ) {
-      EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( a.getElement( i ), i );
+      EXPECT_EQ( b.getElement( i ), i );
    }
 #endif
 }
diff --git a/src/UnitTests/Containers/StaticArrayTest.cpp b/src/UnitTests/Containers/StaticArrayTest.cpp
index b22afa798..e491b2021 100644
--- a/src/UnitTests/Containers/StaticArrayTest.cpp
+++ b/src/UnitTests/Containers/StaticArrayTest.cpp
@@ -117,6 +117,8 @@ void checkCoordinates( StaticArray< 1, Value >& u )
    EXPECT_EQ( u.x(), 0 );
    u.x() += 1;
    EXPECT_EQ( u.x(), 1 );
+   EXPECT_EQ( u[ 0 ], 1 );
+   EXPECT_EQ( u( 0 ), 1 );
 }
 
 template< typename Value >
@@ -127,7 +129,11 @@ void checkCoordinates( StaticArray< 2, Value >& u )
    u.x() += 1;
    u.y() += 1;
    EXPECT_EQ( u.x(), 1 );
+   EXPECT_EQ( u[ 0 ], 1 );
+   EXPECT_EQ( u( 0 ), 1 );
    EXPECT_EQ( u.y(), 2 );
+   EXPECT_EQ( u[ 1 ], 2 );
+   EXPECT_EQ( u( 1 ), 2 );
 }
 
 template< typename Value >
@@ -140,8 +146,14 @@ void checkCoordinates( StaticArray< 3, Value >& u )
    u.y() += 1;
    u.z() += 1;
    EXPECT_EQ( u.x(), 1 );
+   EXPECT_EQ( u[ 0 ], 1 );
+   EXPECT_EQ( u( 0 ), 1 );
    EXPECT_EQ( u.y(), 2 );
+   EXPECT_EQ( u[ 1 ], 2 );
+   EXPECT_EQ( u( 1 ), 2 );
    EXPECT_EQ( u.z(), 3 );
+   EXPECT_EQ( u[ 2 ], 3 );
+   EXPECT_EQ( u( 2 ), 3 );
 }
 
 template< int _size, typename Value >
-- 
GitLab


From a1e3a62d8900664d8977a848314894745e0ac532 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 09:31:04 +0200
Subject: [PATCH 09/52] Added static_asserts to the getIdempotent methods in
 Functional.h

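Also fixed the idempotent values for Max and MaxWithArg: they now return
std::numeric_limits<T>::lowest() instead of std::numeric_limits<T>::min().
LogicalAnd and LogicalOr now return std::numeric_limits<T>::max() and 0
instead of the literals true and false.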
---
 src/TNL/Functional.h | 45 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 10 deletions(-)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index da4aa93fc..d683ff639 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -23,7 +23,7 @@ namespace TNL {
 struct Plus : public std::plus< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 0; };
+   static constexpr T getIdempotent() { return 0; }
 };
 
 /**
@@ -32,7 +32,7 @@ struct Plus : public std::plus< void >
 struct Multiplies : public std::multiplies< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 1; };
+   static constexpr T getIdempotent() { return 1; }
 };
 
 /**
@@ -41,7 +41,12 @@ struct Multiplies : public std::multiplies< void >
 struct Min
 {
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
+   static constexpr T getIdempotent()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::max();
+   }
 
    template< typename Value >
    constexpr Value operator()( const Value& lhs, const Value& rhs ) const
@@ -58,7 +63,12 @@ struct Min
 struct Max
 {
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
+   static constexpr T getIdempotent()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::lowest();
+   }
 
    template< typename Value >
    constexpr Value operator()( const Value& lhs, const Value& rhs ) const
@@ -75,7 +85,12 @@ struct Max
 struct MinWithArg
 {
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
+   static constexpr T getIdempotent()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::max();
+   }
 
    template< typename Value, typename Index >
    constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
@@ -98,7 +113,12 @@ struct MinWithArg
 struct MaxWithArg
 {
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
+   static constexpr T getIdempotent()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::lowest();
+   }
 
    template< typename Value, typename Index >
    constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
@@ -121,7 +141,12 @@ struct MaxWithArg
 struct LogicalAnd : public std::logical_and< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return true; };
+   static constexpr T getIdempotent()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::max();
+   }
 };
 
 /**
@@ -130,7 +155,7 @@ struct LogicalAnd : public std::logical_and< void >
 struct LogicalOr : public std::logical_or< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return false; };
+   static constexpr T getIdempotent() { return 0; }
 };
 
 /**
@@ -139,7 +164,7 @@ struct LogicalOr : public std::logical_or< void >
 struct BitAnd : public std::bit_and< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return ~static_cast< T >( 0 ); };
+   static constexpr T getIdempotent() { return ~static_cast< T >( 0 ); }
 };
 
 /**
@@ -148,7 +173,7 @@ struct BitAnd : public std::bit_and< void >
 struct BitOr : public std::bit_or< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 0; };
+   static constexpr T getIdempotent() { return 0; }
 };
 
 } // namespace TNL
-- 
GitLab


From 72ad8e30364cf88b62441983fe346e08ef8cafa1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 09:54:53 +0200
Subject: [PATCH 10/52] Removed unnecessary lambda functions from expression
 templates

---
 .../Expressions/VerticalOperations.h          | 71 +++----------------
 src/UnitTests/Containers/VectorTest.h         |  4 +-
 2 files changed, 13 insertions(+), 62 deletions(-)

diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
index 704c7f53d..385096303 100644
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ b/src/TNL/Containers/Expressions/VerticalOperations.h
@@ -13,6 +13,7 @@
 #include <limits>
 #include <type_traits>
 
+#include <TNL/Functional.h>
 #include <TNL/Algorithms/reduce.h>
 #include <TNL/Containers/Expressions/TypeTraits.h>
 
@@ -34,16 +35,7 @@ auto ExpressionMin( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b )
-   {
-      // use argument-dependent lookup and make TNL::min available for unqualified calls
-      using TNL::min;
-      return min( a, b );
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Min{}, TNL::Min::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -54,18 +46,7 @@ auto ExpressionArgMin( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( ResultType& a, const ResultType& b, IndexType& aIdx, const IndexType& bIdx ) {
-      if( a > b ) {
-         a = b;
-         aIdx = bIdx;
-      }
-      else if( a == b && bIdx < aIdx )
-         aIdx = bIdx;
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MinWithArg{}, TNL::MinWithArg::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -76,16 +57,7 @@ auto ExpressionMax( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b )
-   {
-      // use argument-dependent lookup and make TNL::max available for unqualified calls
-      using TNL::max;
-      return max( a, b );
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Max{}, TNL::Max::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -96,18 +68,7 @@ auto ExpressionArgMax( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( ResultType& a, const ResultType& b, IndexType& aIdx, const IndexType& bIdx ) {
-      if( a < b ) {
-         a = b;
-         aIdx = bIdx;
-      }
-      else if( a == b && bIdx < aIdx )
-         aIdx = bIdx;
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MaxWithArg{}, TNL::MaxWithArg::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -118,8 +79,7 @@ auto ExpressionSum( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::plus<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Plus{}, TNL::Plus::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -130,8 +90,7 @@ auto ExpressionProduct( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::multiplies<>{}, (ResultType) 1 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Multiplies{}, TNL::Multiplies::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -142,10 +101,7 @@ auto ExpressionLogicalAnd( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_and<>{}, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalAnd{}, TNL::LogicalAnd::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -156,8 +112,7 @@ auto ExpressionLogicalOr( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_or<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalOr{}, TNL::LogicalOr::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -168,10 +123,7 @@ auto ExpressionBinaryAnd( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_and<>{}, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitAnd{}, TNL::BitAnd::template getIdempotent< ResultType >() );
 }
 
 template< typename Expression >
@@ -182,8 +134,7 @@ auto ExpressionBinaryOr( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_or<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitOr{}, TNL::BitOr::template getIdempotent< ResultType >() );
 }
 
 } // namespace Expressions
diff --git a/src/UnitTests/Containers/VectorTest.h b/src/UnitTests/Containers/VectorTest.h
index 8dd7d270a..7c54ab578 100644
--- a/src/UnitTests/Containers/VectorTest.h
+++ b/src/UnitTests/Containers/VectorTest.h
@@ -261,7 +261,7 @@ TEST( VectorSpecialCasesTest, reductionOfEmptyVector )
    EXPECT_EQ( product(v), 1 );
    EXPECT_EQ( logicalAnd(v), true );
    EXPECT_EQ( logicalOr(v), false );
-   EXPECT_EQ( binaryAnd(v), std::numeric_limits< int >::max() );
+   EXPECT_EQ( binaryAnd(v), ~0 );
    EXPECT_EQ( binaryOr(v), 0 );
 
    EXPECT_EQ( min(v_view), std::numeric_limits< int >::max() );
@@ -272,7 +272,7 @@ TEST( VectorSpecialCasesTest, reductionOfEmptyVector )
    EXPECT_EQ( product(v_view), 1 );
    EXPECT_EQ( logicalAnd(v_view), true );
    EXPECT_EQ( logicalOr(v_view), false );
-   EXPECT_EQ( binaryAnd(v_view), std::numeric_limits< int >::max() );
+   EXPECT_EQ( binaryAnd(v_view), ~0 );
    EXPECT_EQ( binaryOr(v_view), 0 );
 }
 
-- 
GitLab


From a3f0ad65acef8432f0008c97781acf5f8c15e88f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 09:55:30 +0200
Subject: [PATCH 11/52] reduce: fixed the Result type in case the fetch functor
 returns a reference

---
 src/TNL/Algorithms/reduce.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Algorithms/reduce.h b/src/TNL/Algorithms/reduce.h
index 875e84319..e0c4bfed5 100644
--- a/src/TNL/Algorithms/reduce.h
+++ b/src/TNL/Algorithms/reduce.h
@@ -123,7 +123,7 @@ auto reduce( const Index begin,
              Fetch&& fetch,
              Reduction&& reduction )
 {
-   using Result = decltype( fetch( ( Index ) 0 ) );
+   using Result = std::decay_t< decltype( fetch( 0 ) ) >;
    return detail::Reduction< Device >::reduce( begin,
                                                end,
                                                std::forward< Fetch >( fetch ),
@@ -250,7 +250,7 @@ reduceWithArgument( const Index begin,
                     Fetch&& fetch,
                     Reduction&& reduction )
 {
-   using Result = decltype( fetch( ( Index ) 0 ) );
+   using Result = std::decay_t< decltype( fetch( 0 ) ) >;
    return detail::Reduction< Device >::reduceWithArgument( begin,
                                                            end,
                                                            std::forward< Fetch >( fetch ),
-- 
GitLab


From 49839f6a9dc1e8a42602effbd1a42b3843508d4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 10:00:00 +0200
Subject: [PATCH 12/52] Fixed formatting in reduce.h and removed unused
 includes

---
 src/TNL/Algorithms/reduce.h | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/TNL/Algorithms/reduce.h b/src/TNL/Algorithms/reduce.h
index e0c4bfed5..877fb2c0b 100644
--- a/src/TNL/Algorithms/reduce.h
+++ b/src/TNL/Algorithms/reduce.h
@@ -15,9 +15,6 @@
 #include <utility>  // std::pair, std::forward
 
 #include <TNL/Functional.h>  // extension of STL functionals for reduction
-#include <TNL/Devices/Sequential.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
 #include <TNL/Algorithms/detail/Reduction.h>
 
 namespace TNL {
@@ -78,7 +75,11 @@ Result reduce( const Index begin,
                Reduction&& reduction,
                const Result& zero )
 {
-    return detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduction >( reduction ), zero );
+   return detail::Reduction< Device >::reduce( begin,
+                                               end,
+                                               std::forward< Fetch >( fetch ),
+                                               std::forward< Reduction >( reduction ),
+                                               zero );
 }
 
 /**
@@ -187,11 +188,11 @@ reduceWithArgument( const Index begin,
                     Reduction&& reduction,
                     const Result& zero )
 {
-    return detail::Reduction< Device >::reduceWithArgument( begin,
-                                                            end,
-                                                            std::forward< Fetch >( fetch ),
-                                                            std::forward< Reduction >( reduction ),
-                                                            zero );
+   return detail::Reduction< Device >::reduceWithArgument( begin,
+                                                           end,
+                                                           std::forward< Fetch >( fetch ),
+                                                           std::forward< Reduction >( reduction ),
+                                                           zero );
 }
 
 /**
-- 
GitLab


From 63d567e4352b5a3ac436ab3c8af8a1f89a19e87a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 20:18:29 +0200
Subject: [PATCH 13/52] Removed useless DeviceType from detail::Reduction

---
 src/TNL/Algorithms/detail/Reduction.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/TNL/Algorithms/detail/Reduction.h b/src/TNL/Algorithms/detail/Reduction.h
index 5db002cdd..ca195077a 100644
--- a/src/TNL/Algorithms/detail/Reduction.h
+++ b/src/TNL/Algorithms/detail/Reduction.h
@@ -40,8 +40,6 @@ struct Reduction;
 template<>
 struct Reduction< Devices::Sequential >
 {
-   using DeviceType = Devices::Sequential;
-
    /**
     * \brief Computes reduction on CPU sequentially.
     *
@@ -106,8 +104,6 @@ struct Reduction< Devices::Sequential >
 template<>
 struct Reduction< Devices::Host >
 {
-   using DeviceType = Devices::Host;
-
    /**
     * \brief Computes reduction on CPU.
     *
@@ -172,8 +168,6 @@ struct Reduction< Devices::Host >
 template<>
 struct Reduction< Devices::Cuda >
 {
-   using DeviceType = Devices::Cuda;
-
    /**
     * \brief Computes reduction on GPU.
     *
-- 
GitLab


From a4e15b0836bd33a9117e28ad5d37065d1fdf979f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 13:58:06 +0200
Subject: [PATCH 14/52] Refactored parallel OpenMP scan

The first phase performs only per-block reduction, not scan. The output
array elements are written only in the second phase, so overall we
perform only `n` instead of `2n` write operations.
---
 src/TNL/Algorithms/Scan.hpp | 124 +++++++++++++++++++++---------------
 1 file changed, 72 insertions(+), 52 deletions(-)

diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index 54780b48f..d7105b91d 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -104,15 +104,42 @@ perform( Vector& v,
          const typename Vector::ValueType zero )
 {
 #ifdef HAVE_OPENMP
-   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() >= 2 ) {
-      const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero );
-      performSecondPhase( v, blockShifts, begin, end, reduction, zero );
+   using ValueType = typename Vector::ValueType;
+   using IndexType = typename Vector::IndexType;
+
+   const IndexType size = end - begin;
+   const int max_threads = Devices::Host::getMaxThreadsCount();
+   const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
+   const IndexType blocks = TNL::roundUpDivision( size, block_size );
+
+   if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
+      const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
+      Containers::Array< ValueType > block_results( blocks + 1 );
+
+      #pragma omp parallel num_threads(threads)
+      {
+         const IndexType block_idx = omp_get_thread_num();
+         const IndexType block_begin = begin + block_idx * block_size;
+         const IndexType block_end = TNL::min( block_begin + block_size, end );
+
+         // step 1: per-block reductions, write the result into the buffer
+         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, v, reduction, zero );
+
+         #pragma omp barrier
+
+         // step 2: scan the block results
+         #pragma omp single
+         {
+            Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, 0, blocks + 1, reduction, zero );
+         }
+
+         // step 3: per-block scan using the block results as initial values
+         Scan< Devices::Sequential, Type >::perform( v, block_begin, block_end, reduction, block_results[ block_idx ] );
+      }
    }
    else
-      Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero );
-#else
-   Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero );
 #endif
+      Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero );
 }
 
 template< ScanType Type >
@@ -130,45 +157,34 @@ performFirstPhase( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
-   const int threads = Devices::Host::getMaxThreadsCount();
-   Containers::Array< ValueType > block_results( threads + 1 );
+   const IndexType size = end - begin;
+   const int max_threads = Devices::Host::getMaxThreadsCount();
+   const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
+   const IndexType blocks = TNL::roundUpDivision( size, block_size );
 
-   #pragma omp parallel num_threads(threads)
-   {
-      // init
-      const int thread_idx = omp_get_thread_num();
-      ValueType block_result = zero;
-
-      // perform scan on blocks statically assigned to threads
-      if( Type == ScanType::Inclusive ) {
-         #pragma omp for schedule(static)
-         for( IndexType i = begin; i < end; i++ ) {
-            block_result = reduction( block_result, v[ i ] );
-            v[ i ] = block_result;
-         }
-      }
-      else {
-         #pragma omp for schedule(static)
-         for( IndexType i = begin; i < end; i++ ) {
-            const ValueType x = v[ i ];
-            v[ i ] = block_result;
-            block_result = reduction( block_result, x );
-         }
-      }
+   if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
+      const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
+      Containers::Array< ValueType, Devices::Sequential > block_results( blocks + 1 );
 
-      // write the block result into the buffer
-      block_results[ thread_idx ] = block_result;
-   }
+      #pragma omp parallel num_threads(threads)
+      {
+         const IndexType block_idx = omp_get_thread_num();
+         const IndexType block_begin = begin + block_idx * block_size;
+         const IndexType block_end = TNL::min( block_begin + block_size, end );
 
-   // block_results now contains scan results for each block. The first phase
-   // ends by computing an exclusive scan of this array.
-   Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, 0, threads + 1, reduction, zero );
+         // step 1: per-block reductions, write the result into the buffer
+         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, v, reduction, zero );
+      }
 
-   // block_results now contains shift values for each block - to be used in the second phase
-   return block_results;
-#else
-   return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
+      // step 2: scan the block results
+      Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, 0, blocks + 1, reduction, zero );
+
+      // block_results now contains shift values for each block - to be used in the second phase
+      return block_results;
+   }
+   else
 #endif
+      return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
 }
 
 template< ScanType Type >
@@ -188,22 +204,26 @@ performSecondPhase( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
-   const int threads = blockShifts.getSize() - 1;
+   const IndexType size = end - begin;
+   const int max_threads = Devices::Host::getMaxThreadsCount();
+   const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
+   const IndexType blocks = TNL::roundUpDivision( size, block_size );
 
-   // launch exactly the same number of threads as in the first phase
-   #pragma omp parallel num_threads(threads)
-   {
-      const int thread_idx = omp_get_thread_num();
-      const ValueType offset = reduction( zero, blockShifts[ thread_idx ] );
+   if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
+      const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
+      #pragma omp parallel num_threads(threads)
+      {
+         const IndexType block_idx = omp_get_thread_num();
+         const IndexType block_begin = begin + block_idx * block_size;
+         const IndexType block_end = TNL::min( block_begin + block_size, end );
 
-      // shift intermediate results by the offset
-      #pragma omp for schedule(static)
-      for( IndexType i = begin; i < end; i++ )
-         v[ i ] = reduction( v[ i ], offset );
+         // phase 2: per-block scan using the block results as initial values
+         Scan< Devices::Sequential, Type >::perform( v, block_begin, block_end, reduction, reduction( zero, blockShifts[ block_idx ] ) );
+      }
    }
-#else
-   Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, zero );
+   else
 #endif
+      Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, zero );
 }
 
 template< ScanType Type >
-- 
GitLab


From e347b4862f6d4c6894ad5eccf108b32cf32267bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 21:04:52 +0200
Subject: [PATCH 15/52] Moved segmented scan into its own header file under
 Algorithms

Also moved the test under Algorithms and made sure it is actually
being compiled.
---
 src/TNL/Algorithms/Scan.h                     | 178 ---------------
 src/TNL/Algorithms/Scan.hpp                   |  83 -------
 src/TNL/Algorithms/SegmentedScan.h            | 204 ++++++++++++++++++
 src/TNL/Algorithms/SegmentedScan.hpp          | 104 +++++++++
 src/TNL/Containers/VectorView.h               |   1 +
 src/UnitTests/Algorithms/CMakeLists.txt       |   2 +-
 .../Algorithms/SegmentedScanTest.cpp          |   1 +
 src/UnitTests/Algorithms/SegmentedScanTest.h  | 183 ++++++++++++++++
 .../Containers/VectorPrefixSumTest.h          |  78 -------
 9 files changed, 494 insertions(+), 340 deletions(-)
 create mode 100644 src/TNL/Algorithms/SegmentedScan.h
 create mode 100644 src/TNL/Algorithms/SegmentedScan.hpp
 create mode 100644 src/UnitTests/Algorithms/SegmentedScanTest.cpp
 create mode 100644 src/UnitTests/Algorithms/SegmentedScanTest.h

diff --git a/src/TNL/Algorithms/Scan.h b/src/TNL/Algorithms/Scan.h
index 4307aee65..348364802 100644
--- a/src/TNL/Algorithms/Scan.h
+++ b/src/TNL/Algorithms/Scan.h
@@ -53,50 +53,6 @@ template< typename Device,
           ScanType Type = ScanType::Inclusive >
 struct Scan;
 
-/**
- * \brief Computes segmented scan (or prefix sum) on a vector.
- *
- * Segmented scan is a modification of common scan. In this case the sequence of
- * numbers in hand is divided into segments like this, for example
- *
- * ```
- * [1,3,5][2,4,6,9][3,5],[3,6,9,12,15]
- * ```
- *
- * and we want to compute inclusive or exclusive scan of each segment. For inclusive segmented prefix sum we get
- *
- * ```
- * [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
- * ```
- *
- * and for exclusive segmented prefix sum it is
- *
- * ```
- * [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
- * ```
- *
- * In addition to common scan, we need to encode the segments of the input sequence.
- * It is done by auxiliary flags array (it can be array of booleans) having `1` at the
- * beginning of each segment and `0` on all other positions. In our example, it would be like this:
- *
- * ```
- * [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
- * [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
- *
- * ```
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Type parameter says if inclusive or exclusive is scan is to be computed.
- *
- * See \ref Scan< Devices::Host, Type > and \ref Scan< Devices::Cuda, Type >.
- *
- * **Note: Segmented scan is not implemented for CUDA yet.**
- */
-template< typename Device,
-          ScanType Type = ScanType::Inclusive >
-struct SegmentedScan;
-
-
 template< ScanType Type >
 struct Scan< Devices::Sequential, Type >
 {
@@ -277,140 +233,6 @@ struct Scan< Devices::Cuda, Type >
                        const typename Vector::ValueType zero );
 };
 
-template< ScanType Type >
-struct SegmentedScan< Devices::Sequential, Type >
-{
-   /**
-    * \brief Computes segmented scan (prefix sum) sequentially.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    * \tparam Flags array type containing zeros and ones defining the segments begining
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param flags is an array with zeros and ones defining the segments begining
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SegmentedScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include SegmentedScanExample.out
-    */
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-   static void
-   perform( Vector& v,
-            Flags& flags,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::ValueType zero );
-};
-
-template< ScanType Type >
-struct SegmentedScan< Devices::Host, Type >
-{
-   /**
-    * \brief Computes segmented scan (prefix sum) using OpenMP.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    * \tparam Flags array type containing zeros and ones defining the segments begining
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param flags is an array with zeros and ones defining the segments begining
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SegmentedScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include SegmentedScanExample.out
-    */
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-   static void
-   perform( Vector& v,
-            Flags& flags,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::ValueType zero );
-};
-
-template< ScanType Type >
-struct SegmentedScan< Devices::Cuda, Type >
-{
-   /**
-    * \brief Computes segmented scan (prefix sum) on GPU.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    * \tparam Flags array type containing zeros and ones defining the segments begining
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param flags is an array with zeros and ones defining the segments begining
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SegmentedScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include SegmentedScanExample.out
-    *
-    * **Note: Segmented scan is not implemented for CUDA yet.**
-    */
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-   static void
-   perform( Vector& v,
-            Flags& flags,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::ValueType zero );
-};
-
 } // namespace Algorithms
 } // namespace TNL
 
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index d7105b91d..17de19a41 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -20,7 +20,6 @@
 #include <TNL/Containers/StaticArray.h>
 #include <TNL/Algorithms/detail/CudaScanKernel.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
-#include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
 namespace Algorithms {
@@ -306,87 +305,5 @@ performSecondPhase( Vector& v,
 #endif
 }
 
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-void
-SegmentedScan< Devices::Sequential, Type >::
-perform( Vector& v,
-         Flags& flags,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::ValueType zero )
-{
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
-
-   if( Type == ScanType::Inclusive )
-   {
-      for( IndexType i = begin + 1; i < end; i++ )
-         if( ! flags[ i ] )
-            v[ i ] = reduction( v[ i ], v[ i - 1 ] );
-   }
-   else // Exclusive scan
-   {
-      ValueType aux( v[ begin ] );
-      v[ begin ] = zero;
-      for( IndexType i = begin + 1; i < end; i++ )
-      {
-         ValueType x = v[ i ];
-         if( flags[ i ] )
-            aux = zero;
-         v[ i ] = aux;
-         aux = reduction( aux, x );
-      }
-   }
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-void
-SegmentedScan< Devices::Host, Type >::
-perform( Vector& v,
-         Flags& flags,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::ValueType zero )
-{
-#ifdef HAVE_OPENMP
-   // TODO: parallelize with OpenMP
-   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
-#else
-   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
-#endif
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-void
-SegmentedScan< Devices::Cuda, Type >::
-perform( Vector& v,
-         Flags& flags,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::ValueType zero )
-{
-#ifdef HAVE_CUDA
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
-
-   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/SegmentedScan.h b/src/TNL/Algorithms/SegmentedScan.h
new file mode 100644
index 000000000..f16a5335c
--- /dev/null
+++ b/src/TNL/Algorithms/SegmentedScan.h
@@ -0,0 +1,204 @@
+/***************************************************************************
+                          SegmentedScan.h  -  description
+                             -------------------
+    begin                : May 9, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+#include "Scan.h"  // only for the ScanType
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Computes segmented scan (or prefix sum) on a vector.
+ *
+ * Segmented scan is a modification of common scan. In this case the sequence of
+ * numbers in hand is divided into segments like this, for example
+ *
+ * ```
+ * [1,3,5][2,4,6,9][3,5][3,6,9,12,15]
+ * ```
+ *
+ * and we want to compute inclusive or exclusive scan of each segment. For inclusive segmented prefix sum we get
+ *
+ * ```
+ * [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
+ * ```
+ *
+ * and for exclusive segmented prefix sum it is
+ *
+ * ```
+ * [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
+ * ```
+ *
+ * Compared to the common scan, we also need to encode the segments of the input sequence.
+ * This is done by an auxiliary flags array (it can be an array of booleans) having `1` at the
+ * beginning of each segment and `0` at all other positions. In our example, it would look like this:
+ *
+ * ```
+ * [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
+ * [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
+ *
+ * ```
+ *
+ * \tparam Device parameter says on what device the segmented scan is going to be performed.
+ * \tparam Type parameter says if inclusive or exclusive scan is to be computed.
+ *
+ * See \ref Scan< Devices::Host, Type > and \ref Scan< Devices::Cuda, Type >.
+ *
+ * **Note: Segmented scan is not implemented for CUDA yet.**
+ */
+template< typename Device,
+          ScanType Type = ScanType::Inclusive >
+struct SegmentedScan;
+
+template< ScanType Type >
+struct SegmentedScan< Devices::Sequential, Type >
+{
+   /**
+    * \brief Computes segmented scan (prefix sum) sequentially.
+    *
+    * \tparam Vector type of the vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones defining the beginnings of the segments
+    *
+    * \param v input vector, the result of the scan is stored in the same vector
+    * \param flags is an array with zeros and ones defining the beginnings of the segments
+    * \param begin index of the first element in the array to be scanned
+    * \param end index one past the last element in the array to be scanned (the scanned range is `[begin, end)`)
+    * \param reduction lambda function implementing the reduction operation
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/SegmentedScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include SegmentedScanExample.out
+    */
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+   static void
+   perform( Vector& v,
+            Flags& flags,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::ValueType zero );
+};
+
+template< ScanType Type >
+struct SegmentedScan< Devices::Host, Type >
+{
+   /**
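+    * \brief Computes segmented scan (prefix sum) on the host (currently delegates to the sequential implementation; OpenMP parallelization is not implemented yet).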
+    *
+    * \tparam Vector type of the vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones defining the beginnings of the segments
+    *
+    * \param v input vector, the result of the scan is stored in the same vector
+    * \param flags is an array with zeros and ones defining the beginnings of the segments
+    * \param begin index of the first element in the array to be scanned
+    * \param end index one past the last element in the array to be scanned (the scanned range is `[begin, end)`)
+    * \param reduction lambda function implementing the reduction operation
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/SegmentedScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include SegmentedScanExample.out
+    */
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+   static void
+   perform( Vector& v,
+            Flags& flags,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::ValueType zero );
+};
+
+template< ScanType Type >
+struct SegmentedScan< Devices::Cuda, Type >
+{
+   /**
+    * \brief Computes segmented scan (prefix sum) on GPU.
+    *
+    * \tparam Vector type of the vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones defining the beginnings of the segments
+    *
+    * \param v input vector, the result of the scan is stored in the same vector
+    * \param flags is an array with zeros and ones defining the beginnings of the segments
+    * \param begin the first element in the array to be scanned
+    * \param end the last element in the array to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/SegmentedScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include SegmentedScanExample.out
+    *
+    * **Note: Segmented scan is not implemented for CUDA yet.**
+    */
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+   static void
+   perform( Vector& v,
+            Flags& flags,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::ValueType zero );
+};
+
+} // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/SegmentedScan.hpp>
diff --git a/src/TNL/Algorithms/SegmentedScan.hpp b/src/TNL/Algorithms/SegmentedScan.hpp
new file mode 100644
index 000000000..467146e51
--- /dev/null
+++ b/src/TNL/Algorithms/SegmentedScan.hpp
@@ -0,0 +1,104 @@
+/***************************************************************************
+                          SegmentedScan.hpp  -  description
+                             -------------------
+    begin                : Mar 24, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include "SegmentedScan.h"
+
+#include <TNL/Exceptions/NotImplementedError.h>
+
+namespace TNL {
+namespace Algorithms {
+
+template< ScanType Type >
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+void
+SegmentedScan< Devices::Sequential, Type >::
+perform( Vector& v,
+         Flags& flags,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::ValueType zero )
+{
+   using ValueType = typename Vector::ValueType;
+   using IndexType = typename Vector::IndexType;
+
+   if( Type == ScanType::Inclusive )
+   {
+      for( IndexType i = begin + 1; i < end; i++ )
+         if( ! flags[ i ] )
+            v[ i ] = reduction( v[ i ], v[ i - 1 ] );
+   }
+   else // Exclusive scan
+   {
+      ValueType aux( v[ begin ] );
+      v[ begin ] = zero;
+      for( IndexType i = begin + 1; i < end; i++ )
+      {
+         ValueType x = v[ i ];
+         if( flags[ i ] )
+            aux = zero;
+         v[ i ] = aux;
+         aux = reduction( aux, x );
+      }
+   }
+}
+
+template< ScanType Type >
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+void
+SegmentedScan< Devices::Host, Type >::
+perform( Vector& v,
+         Flags& flags,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::ValueType zero )
+{
+#ifdef HAVE_OPENMP
+   // TODO: parallelize with OpenMP
+   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
+#else
+   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
+#endif
+}
+
+template< ScanType Type >
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+void
+SegmentedScan< Devices::Cuda, Type >::
+perform( Vector& v,
+         Flags& flags,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::ValueType zero )
+{
+#ifdef HAVE_CUDA
+   using ValueType = typename Vector::ValueType;
+   using IndexType = typename Vector::IndexType;
+
+   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h
index 2416b8509..55983f43e 100644
--- a/src/TNL/Containers/VectorView.h
+++ b/src/TNL/Containers/VectorView.h
@@ -15,6 +15,7 @@
 #include <TNL/Containers/ArrayView.h>
 #include <TNL/Containers/Expressions/ExpressionTemplates.h>
 #include <TNL/Algorithms/Scan.h>
+#include <TNL/Algorithms/SegmentedScan.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index a9a5db9ce..6ac0af432 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -10,7 +10,7 @@ set( COMMON_TESTS
          unrolledForTest
 )
 
-set( CPP_TESTS )
+set( CPP_TESTS SegmentedScanTest )
 set( CUDA_TESTS )
 if( BUILD_CUDA )
    set( CUDA_TESTS  ${CUDA_TESTS} ${COMMON_TESTS} )
diff --git a/src/UnitTests/Algorithms/SegmentedScanTest.cpp b/src/UnitTests/Algorithms/SegmentedScanTest.cpp
new file mode 100644
index 000000000..74d5a80a3
--- /dev/null
+++ b/src/UnitTests/Algorithms/SegmentedScanTest.cpp
@@ -0,0 +1 @@
+#include "SegmentedScanTest.h"
diff --git a/src/UnitTests/Algorithms/SegmentedScanTest.h b/src/UnitTests/Algorithms/SegmentedScanTest.h
new file mode 100644
index 000000000..7f141fd72
--- /dev/null
+++ b/src/UnitTests/Algorithms/SegmentedScanTest.h
@@ -0,0 +1,183 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+
+#include <TNL/Arithmetics/Quad.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/SegmentedScan.h>
+
+#include "gtest/gtest.h"
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Arithmetics;
+using namespace TNL::Algorithms;
+
+// should be small enough to have fast tests, but larger than minGPUReductionDataSize
+// and large enough to require multiple CUDA blocks for reduction
+constexpr int ARRAY_TEST_SIZE = 10000;
+
+// test fixture for typed tests
+template< typename Array >
+class SegmentedScanTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;
+   using ViewType = ArrayView< typename Array::ValueType, typename Array::DeviceType, typename Array::IndexType >;
+};
+
+// types for which SegmentedScanTest is instantiated
+// TODO: Quad must be fixed
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< int,            Devices::Sequential, short >,
+   Array< long,           Devices::Sequential, short >,
+   Array< float,          Devices::Sequential, short >,
+   Array< double,         Devices::Sequential, short >,
+   //Array< Quad< float >,  Devices::Sequential, short >,
+   //Array< Quad< double >, Devices::Sequential, short >,
+   Array< int,            Devices::Sequential, int >,
+   Array< long,           Devices::Sequential, int >,
+   Array< float,          Devices::Sequential, int >,
+   Array< double,         Devices::Sequential, int >,
+   //Array< Quad< float >,  Devices::Sequential, int >,
+   //Array< Quad< double >, Devices::Sequential, int >,
+   Array< int,            Devices::Sequential, long >,
+   Array< long,           Devices::Sequential, long >,
+   Array< float,          Devices::Sequential, long >,
+   Array< double,         Devices::Sequential, long >,
+   //Array< Quad< float >,  Devices::Sequential, long >,
+   //Array< Quad< double >, Devices::Sequential, long >,
+
+   Array< int,            Devices::Host, short >,
+   Array< long,           Devices::Host, short >,
+   Array< float,          Devices::Host, short >,
+   Array< double,         Devices::Host, short >,
+   //Array< Quad< float >,  Devices::Host, short >,
+   //Array< Quad< double >, Devices::Host, short >,
+   Array< int,            Devices::Host, int >,
+   Array< long,           Devices::Host, int >,
+   Array< float,          Devices::Host, int >,
+   Array< double,         Devices::Host, int >,
+   //Array< Quad< float >,  Devices::Host, int >,
+   //Array< Quad< double >, Devices::Host, int >,
+   Array< int,            Devices::Host, long >,
+   Array< long,           Devices::Host, long >,
+   Array< float,          Devices::Host, long >,
+   Array< double,         Devices::Host, long >
+   //Array< Quad< float >,  Devices::Host, long >,
+   //Array< Quad< double >, Devices::Host, long >
+#endif
+// TODO: segmented scan for CUDA is not implemented yet
+//#ifdef HAVE_CUDA
+//   Array< int,            Devices::Cuda, short >,
+//   Array< long,           Devices::Cuda, short >,
+//   Array< float,          Devices::Cuda, short >,
+//   Array< double,         Devices::Cuda, short >,
+//   //Array< Quad< float >,  Devices::Cuda, short >,
+//   //Array< Quad< double >, Devices::Cuda, short >,
+//   Array< int,            Devices::Cuda, int >,
+//   Array< long,           Devices::Cuda, int >,
+//   Array< float,          Devices::Cuda, int >,
+//   Array< double,         Devices::Cuda, int >,
+//   //Array< Quad< float >,  Devices::Cuda, int >,
+//   //Array< Quad< double >, Devices::Cuda, int >,
+//   Array< int,            Devices::Cuda, long >,
+//   Array< long,           Devices::Cuda, long >,
+//   Array< float,          Devices::Cuda, long >,
+//   Array< double,         Devices::Cuda, long >
+//   //Array< Quad< float >,  Devices::Cuda, long >,
+//   //Array< Quad< double >, Devices::Cuda, long >
+//#endif
+>;
+
+TYPED_TEST_SUITE( SegmentedScanTest, ArrayTypes );
+
+template< typename Array >
+void setLinearSequence( Array& array )
+{
+   using Value = typename Array::ValueType;
+   using Index = typename Array::IndexType;
+   auto f1 = [] __cuda_callable__ ( Index i, Value& value ) { value = i; };
+   array.forAllElements( f1 );
+}
+
+template< typename FlagsView >
+void setupFlags( FlagsView& flags )
+{
+   using Value = typename FlagsView::ValueType;
+   using Index = typename FlagsView::IndexType;
+   auto f1 = [] __cuda_callable__ ( Index i, Value& value ) { value = ( i % 5 == 0 ); };
+   flags.forAllElements( f1 );
+}
+
+TYPED_TEST( SegmentedScanTest, inclusive )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   using ViewType = typename TestFixture::ViewType;
+   using ValueType = typename ArrayType::ValueType;
+   using DeviceType = typename ArrayType::DeviceType;
+   using IndexType = typename ArrayType::IndexType;
+   using FlagsArrayType = Array< bool, DeviceType, IndexType >;
+   using FlagsViewType = ArrayView< bool, DeviceType, IndexType >;
+   const int size = ARRAY_TEST_SIZE;
+
+   ArrayType v( size );
+   ViewType v_view( v );
+
+   FlagsArrayType flags( size ), flags_copy( size );
+   FlagsViewType flags_view( flags );
+   setupFlags( flags_view );
+   flags_copy = flags_view;
+
+   v = 0;
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v.getElement( i ), 0 );
+   flags_view = flags_copy;
+
+   v = 1;
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v.getElement( i ), ( i % 5 ) + 1 );
+   flags_view = flags_copy;
+
+   setLinearSequence( v );
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   for( int i = 1; i < size; i++ )
+   {
+      if( flags.getElement( i ) )
+         EXPECT_EQ( v.getElement( i ), i );
+      else
+         EXPECT_EQ( v.getElement( i ) - v.getElement( i - 1 ), i );
+   }
+   flags_view = flags_copy;
+
+   v_view = 0;
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_view.getElement( i ), 0 );
+   flags_view = flags_copy;
+
+   v_view = 1;
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_view.getElement( i ), ( i % 5 ) + 1 );
+   flags_view = flags_copy;
+
+   setLinearSequence( v );
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   for( int i = 1; i < size; i++ )
+   {
+      if( flags.getElement( i ) )
+         EXPECT_EQ( v_view.getElement( i ), i );
+      else
+         EXPECT_EQ( v_view.getElement( i ) - v_view.getElement( i - 1 ), i );
+   }
+}
+
+// TODO: test exclusive segmented scan
+
+#endif // HAVE_GTEST
+
+#include "../main.h"
diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h
index 3c52e9eef..3bf99d997 100644
--- a/src/UnitTests/Containers/VectorPrefixSumTest.h
+++ b/src/UnitTests/Containers/VectorPrefixSumTest.h
@@ -264,84 +264,6 @@ TYPED_TEST( VectorTest, exclusiveScan )
 
 // TODO: test scan with custom begin and end parameters
 
-
-template< typename FlagsView >
-void setupFlags( FlagsView& f )
-{
-   auto f1 = [] __cuda_callable__ ( typename FlagsView::IndexType i ) { return ( i % 5 ) == 0; };
-   f.evaluate( f1 );
-}
-
-/*
-TYPED_TEST( VectorTest, segmentedScan )
-{
-   using VectorType = typename TestFixture::VectorType;
-   using ViewType = typename TestFixture::ViewType;
-   using RealType = typename VectorType::RealType;
-   using DeviceType = typename VectorType::DeviceType;
-   using IndexType = typename VectorType::IndexType;
-   using FlagsArrayType = Array< bool, DeviceType, IndexType >;
-   using FlagsViewType = ArrayView< bool, DeviceType, IndexType >;
-   const int size = VECTOR_TEST_SIZE;
-
-   VectorType v( size );
-   ViewType v_view( v );
-
-   FlagsArrayType flags( size ), flags_copy( size );
-   FlagsViewType flags_view( flags );
-   //auto f1 = [] __cuda_callable__ ( IndexType i ) { return ( i % 5 ) == 0; };
-   //flags_view.evaluate( f1 );
-   setupFlags( flags_view );
-   flags_copy = flags_view;
-
-   v = 0;
-   v.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v.getElement( i ), 0 );
-   flags_view = flags_copy;
-
-   v = 1;
-   v.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v.getElement( i ), ( i % 5 ) + 1 );
-   flags_view = flags_copy;
-
-   setLinearSequence( v );
-   v.computeSegmentedScan( flags_view );
-   for( int i = 1; i < size; i++ )
-   {
-      if( flags.getElement( i ) )
-         EXPECT_EQ( v.getElement( i ), i );
-      else
-         EXPECT_EQ( v.getElement( i ) - v.getElement( i - 1 ), i );
-   }
-   flags_view = flags_copy;
-
-   v_view = 0;
-   v_view.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_view.getElement( i ), 0 );
-   flags_view = flags_copy;
-
-   v_view = 1;
-   v_view.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_view.getElement( i ), ( i % 5 ) + 1 );
-   flags_view = flags_copy;
-
-   //v_view.evaluate( [] __cuda_callable__ ( IndexType i ) { return i; } );
-   setLinearSequence( v );
-   v_view.computeSegmentedScan( flags_view );
-   for( int i = 1; i < size; i++ )
-   {
-      if( flags.getElement( i ) )
-         EXPECT_EQ( v_view.getElement( i ), i );
-      else
-         EXPECT_EQ( v_view.getElement( i ) - v_view.getElement( i - 1 ), i );
-   }
-}
-*/
-
 #endif // HAVE_GTEST
 
 #include "../main.h"
-- 
GitLab


From 19e9b4e53341b3c829b68664ec78ddcc61850c2b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 22:06:40 +0200
Subject: [PATCH 16/52] Moved scan tests from Containers to Algorithms

---
 src/UnitTests/Algorithms/CMakeLists.txt       |  32 +-
 .../Algorithms/DistributedScanTest.cpp        |   1 +
 .../DistributedScanTest.h}                    | 208 ++++++-----
 .../Algorithms/DistributedScanTestCuda.cu     |   1 +
 src/UnitTests/Algorithms/ScanTest.cpp         |   1 +
 src/UnitTests/Algorithms/ScanTest.h           | 329 ++++++++++++++++++
 src/UnitTests/Algorithms/ScanTestCuda.cu      |   1 +
 src/UnitTests/Containers/CMakeLists.txt       |  20 --
 .../Containers/DistributedVectorTest.cpp      |   1 -
 .../Containers/DistributedVectorTestCuda.cu   |   1 -
 .../Containers/VectorHelperFunctions.h        |  14 +-
 .../Containers/VectorPrefixSumTest.cpp        |   1 -
 .../Containers/VectorPrefixSumTest.h          | 269 --------------
 .../Containers/VectorPrefixSumTestCuda.cu     |   1 -
 14 files changed, 470 insertions(+), 410 deletions(-)
 create mode 100644 src/UnitTests/Algorithms/DistributedScanTest.cpp
 rename src/UnitTests/{Containers/DistributedVectorTest.h => Algorithms/DistributedScanTest.h} (54%)
 create mode 100644 src/UnitTests/Algorithms/DistributedScanTestCuda.cu
 create mode 100644 src/UnitTests/Algorithms/ScanTest.cpp
 create mode 100644 src/UnitTests/Algorithms/ScanTest.h
 create mode 100644 src/UnitTests/Algorithms/ScanTestCuda.cu
 delete mode 100644 src/UnitTests/Containers/DistributedVectorTest.cpp
 delete mode 100644 src/UnitTests/Containers/DistributedVectorTestCuda.cu
 delete mode 100644 src/UnitTests/Containers/VectorPrefixSumTest.cpp
 delete mode 100644 src/UnitTests/Containers/VectorPrefixSumTest.h
 delete mode 100644 src/UnitTests/Containers/VectorPrefixSumTestCuda.cu

diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 6ac0af432..92f58def4 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -10,8 +10,13 @@ set( COMMON_TESTS
          unrolledForTest
 )
 
-set( CPP_TESTS SegmentedScanTest )
-set( CUDA_TESTS )
+set( CPP_TESTS
+         ScanTest
+         SegmentedScanTest
+)
+set( CUDA_TESTS
+         ScanTestCuda
+)
 if( BUILD_CUDA )
    set( CUDA_TESTS  ${CUDA_TESTS} ${COMMON_TESTS} )
 else()
@@ -32,3 +37,26 @@ if( BUILD_CUDA )
       add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} )
    endforeach()
 endif()
+
+
+if( ${BUILD_MPI} )
+   ADD_EXECUTABLE( DistributedScanTest DistributedScanTest.cpp )
+   TARGET_COMPILE_OPTIONS( DistributedScanTest PRIVATE ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( DistributedScanTest ${GTEST_BOTH_LIBRARIES} )
+
+   if( BUILD_CUDA )
+      CUDA_ADD_EXECUTABLE( DistributedScanTestCuda DistributedScanTestCuda.cu
+                           OPTIONS ${CXX_TESTS_FLAGS} )
+      TARGET_LINK_LIBRARIES( DistributedScanTestCuda ${GTEST_BOTH_LIBRARIES} )
+   endif()
+
+   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
+   ADD_TEST( NAME DistributedScanTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME DistributedScanTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
+
+   if( BUILD_CUDA )
+      SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
+      ADD_TEST( NAME DistributedScanTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME DistributedScanTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
+   endif()
+endif()
diff --git a/src/UnitTests/Algorithms/DistributedScanTest.cpp b/src/UnitTests/Algorithms/DistributedScanTest.cpp
new file mode 100644
index 000000000..9c78e1ef9
--- /dev/null
+++ b/src/UnitTests/Algorithms/DistributedScanTest.cpp
@@ -0,0 +1 @@
+#include "DistributedScanTest.h"
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Algorithms/DistributedScanTest.h
similarity index 54%
rename from src/UnitTests/Containers/DistributedVectorTest.h
rename to src/UnitTests/Algorithms/DistributedScanTest.h
index 24da5fbe7..5a15187bb 100644
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ b/src/UnitTests/Algorithms/DistributedScanTest.h
@@ -1,52 +1,48 @@
-/***************************************************************************
-                          DistributedVectorTest.h  -  description
-                             -------------------
-    begin                : Sep 6, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
+#pragma once
 
 #ifdef HAVE_GTEST
 #include <limits>
 
 #include <gtest/gtest.h>
 
-#include <TNL/Containers/DistributedVector.h>
-#include <TNL/Containers/DistributedVectorView.h>
+#include <TNL/Containers/DistributedArray.h>
+#include <TNL/Containers/DistributedArrayView.h>
 #include <TNL/Containers/Partitioner.h>
+#include <TNL/Algorithms/DistributedScan.h>
 
 #define DISTRIBUTED_VECTOR
-#include "VectorHelperFunctions.h"
+#include "../Containers/VectorHelperFunctions.h"
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::Algorithms;
 using namespace TNL::MPI;
 
 /*
- * Light check of DistributedVector.
+ * Light check of DistributedArray.
  *
  * - Number of processes is not limited.
  * - Global size is hardcoded as 97 to force non-uniform distribution.
  * - Communication group is hardcoded as AllGroup -- it may be changed as needed.
  */
-template< typename DistributedVector >
-class DistributedVectorTest
+template< typename DistributedArray >
+class DistributedScanTest
 : public ::testing::Test
 {
 protected:
-   using RealType = typename DistributedVector::RealType;
-   using DeviceType = typename DistributedVector::DeviceType;
-   using IndexType = typename DistributedVector::IndexType;
-   using DistributedVectorType = DistributedVector;
-   using VectorViewType = typename DistributedVectorType::LocalViewType;
-   using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType >;
-   using HostDistributedVectorType = typename DistributedVectorType::template Self< RealType, Devices::Sequential >;
+   using ValueType = typename DistributedArray::ValueType;
+   using DeviceType = typename DistributedArray::DeviceType;
+   using IndexType = typename DistributedArray::IndexType;
+   using DistributedArrayType = DistributedArray;
+   using VectorViewType = typename DistributedArrayType::LocalViewType;
+   using DistributedArrayView = Containers::DistributedArrayView< ValueType, DeviceType, IndexType >;
+   using HostDistributedArrayType = typename DistributedArrayType::template Self< ValueType, Devices::Sequential >;
 
    const MPI_Comm group = AllGroup();
 
-   DistributedVectorType v;
-   DistributedVectorView v_view;
-   HostDistributedVectorType v_host;
+   DistributedArrayType v;
+   DistributedArrayView v_view;
+   HostDistributedArrayType v_host;
 
    const int rank = GetRank(group);
    const int nproc = GetSize(group);
@@ -58,9 +54,9 @@ protected:
    // some arbitrary value (but must be 0 if not distributed)
    const int ghosts = (nproc > 1) ? 4 : 0;
 
-   DistributedVectorTest()
+   DistributedScanTest()
    {
-      using LocalRangeType = typename DistributedVector::LocalRangeType;
+      using LocalRangeType = typename DistributedArray::LocalRangeType;
       const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
       v.setDistribution( localRange, ghosts, globalSize, group );
 
@@ -75,74 +71,70 @@ protected:
    }
 };
 
-// types for which DistributedVectorTest is instantiated
-using DistributedVectorTypes = ::testing::Types<
-   DistributedVector< double, Devices::Sequential, int >,
-   DistributedVector< double, Devices::Host, int >
+// types for which DistributedScanTest is instantiated
+using DistributedArrayTypes = ::testing::Types<
+   DistributedArray< double, Devices::Sequential, int >,
+   DistributedArray< double, Devices::Host, int >
 #ifdef HAVE_CUDA
    ,
-   DistributedVector< double, Devices::Cuda, int >
+   DistributedArray< double, Devices::Cuda, int >
 #endif
 >;
 
-TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes );
+TYPED_TEST_SUITE( DistributedScanTest, DistributedArrayTypes );
 
 // TODO: test that horizontal operations are computed for ghost values without synchronization
 
-TYPED_TEST( DistributedVectorTest, scan )
+TYPED_TEST( DistributedScanTest, inclusiveScan )
 {
-   using RealType = typename TestFixture::DistributedVectorType::RealType;
-   using DeviceType = typename TestFixture::DistributedVectorType::DeviceType;
-   using IndexType = typename TestFixture::DistributedVectorType::IndexType;
+   using ValueType = typename TestFixture::DistributedArrayType::ValueType;
+   using DeviceType = typename TestFixture::DistributedArrayType::DeviceType;
+   using IndexType = typename TestFixture::DistributedArrayType::IndexType;
 
    auto& v = this->v;
    auto& v_view = this->v_view;
    auto& v_host = this->v_host;
    const auto localRange = v.getLocalRange();
 
-   // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
-      return;
-
    setConstantSequence( v, 0 );
-   v_host = -1;
-   v.scan();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
-   v_host = -1;
-   v.scan();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
    setLinearSequence( v );
-   v_host = -1;
-   v.scan();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
    // test views
    setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.scan();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.scan();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
    setLinearSequence( v );
-   v_host = -1;
-   v_view.scan();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -152,67 +144,67 @@ TYPED_TEST( DistributedVectorTest, scan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
 
       setLinearSequence( v );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
       // test views
       setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
 
       setLinearSequence( v );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
 
-TYPED_TEST( DistributedVectorTest, exclusiveScan )
+TYPED_TEST( DistributedScanTest, exclusiveScan )
 {
-   using RealType = typename TestFixture::DistributedVectorType::RealType;
-   using DeviceType = typename TestFixture::DistributedVectorType::DeviceType;
-   using IndexType = typename TestFixture::DistributedVectorType::IndexType;
+   using ValueType = typename TestFixture::DistributedArrayType::ValueType;
+   using DeviceType = typename TestFixture::DistributedArrayType::DeviceType;
+   using IndexType = typename TestFixture::DistributedArrayType::IndexType;
 
    auto& v = this->v;
    auto& v_view = this->v_view;
@@ -220,48 +212,48 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
    const auto localRange = v.getLocalRange();
 
    // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
+   if( std::is_same< ValueType, float >::value )
       return;
 
    setConstantSequence( v, 0 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
    setLinearSequence( v );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
    // test views
    setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
    setLinearSequence( v );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
+   v_host.setValue( -1 );
+   DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -271,58 +263,58 @@ TYPED_TEST( DistributedVectorTest, exclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
 
       setLinearSequence( v );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
       // test views
       setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
 
       setLinearSequence( v );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
+      v_host.setValue( -1 );
+      DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
diff --git a/src/UnitTests/Algorithms/DistributedScanTestCuda.cu b/src/UnitTests/Algorithms/DistributedScanTestCuda.cu
new file mode 100644
index 000000000..9c78e1ef9
--- /dev/null
+++ b/src/UnitTests/Algorithms/DistributedScanTestCuda.cu
@@ -0,0 +1 @@
+#include "DistributedScanTest.h"
diff --git a/src/UnitTests/Algorithms/ScanTest.cpp b/src/UnitTests/Algorithms/ScanTest.cpp
new file mode 100644
index 000000000..ac886b753
--- /dev/null
+++ b/src/UnitTests/Algorithms/ScanTest.cpp
@@ -0,0 +1 @@
+#include "ScanTest.h"
diff --git a/src/UnitTests/Algorithms/ScanTest.h b/src/UnitTests/Algorithms/ScanTest.h
new file mode 100644
index 000000000..60a91a1e0
--- /dev/null
+++ b/src/UnitTests/Algorithms/ScanTest.h
@@ -0,0 +1,329 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+
+#include <TNL/Arithmetics/Quad.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/Scan.h>
+
+#include "../Containers/VectorHelperFunctions.h"
+
+#include "gtest/gtest.h"
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Arithmetics;
+using namespace TNL::Algorithms;
+
+// should be small enough to have fast tests, but larger than minGPUReductionDataSize
+// and large enough to need multiple CUDA blocks (and multiple grids once maxGridSize() is lowered) for the scan
+constexpr int ARRAY_TEST_SIZE = 10000;
+
+// Test fixture for the typed scan tests; it only exposes the type aliases used by the test bodies.
+template< typename Array >
+class ScanTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;  // array type the suite is instantiated with
+   using ViewType = ArrayView< typename Array::ValueType, typename Array::DeviceType, typename Array::IndexType >;  // view with the same value/device/index types
+};
+
+// types for which ScanTest is instantiated
+// TODO: Quad must be fixed (the Quad instantiations below stay commented out until then)
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< int,            Devices::Sequential, short >,
+   Array< long,           Devices::Sequential, short >,
+   Array< double,         Devices::Sequential, short >,
+   //Array< Quad< float >,  Devices::Sequential, short >,
+   //Array< Quad< double >, Devices::Sequential, short >,
+   Array< int,            Devices::Sequential, int >,
+   Array< long,           Devices::Sequential, int >,
+   Array< double,         Devices::Sequential, int >,
+   //Array< Quad< float >,  Devices::Sequential, int >,
+   //Array< Quad< double >, Devices::Sequential, int >,
+   Array< int,            Devices::Sequential, long >,
+   Array< long,           Devices::Sequential, long >,
+   Array< double,         Devices::Sequential, long >,
+   //Array< Quad< float >,  Devices::Sequential, long >,
+   //Array< Quad< double >, Devices::Sequential, long >,
+
+   Array< int,            Devices::Host, short >,
+   Array< long,           Devices::Host, short >,
+   Array< double,         Devices::Host, short >,
+   //Array< Quad< float >,  Devices::Host, short >,
+   //Array< Quad< double >, Devices::Host, short >,
+   Array< int,            Devices::Host, int >,
+   Array< long,           Devices::Host, int >,
+   Array< double,         Devices::Host, int >,
+   //Array< Quad< float >,  Devices::Host, int >,
+   //Array< Quad< double >, Devices::Host, int >,
+   Array< int,            Devices::Host, long >,
+   Array< long,           Devices::Host, long >,
+   Array< double,         Devices::Host, long >
+   //Array< Quad< float >,  Devices::Host, long >,
+   //Array< Quad< double >, Devices::Host, long >
+#endif
+#ifdef HAVE_CUDA
+   Array< int,            Devices::Cuda, short >,
+   Array< long,           Devices::Cuda, short >,
+   Array< double,         Devices::Cuda, short >,
+   //Array< Quad< float >,  Devices::Cuda, short >,
+   //Array< Quad< double >, Devices::Cuda, short >,
+   Array< int,            Devices::Cuda, int >,
+   Array< long,           Devices::Cuda, int >,
+   Array< double,         Devices::Cuda, int >,
+   //Array< Quad< float >,  Devices::Cuda, int >,
+   //Array< Quad< double >, Devices::Cuda, int >,
+   Array< int,            Devices::Cuda, long >,
+   Array< long,           Devices::Cuda, long >,
+   Array< double,         Devices::Cuda, long >
+   //Array< Quad< float >,  Devices::Cuda, long >,
+   //Array< Quad< double >, Devices::Cuda, long >
+#endif
+>;
+
+TYPED_TEST_SUITE( ScanTest, ArrayTypes );
+
+TYPED_TEST( ScanTest, inclusiveScan )  // inclusive scan with std::plus over [0, size), written back into v and compared with closed-form sums
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   using ViewType = typename TestFixture::ViewType;
+   using ValueType = typename ArrayType::ValueType;
+   using DeviceType = typename ArrayType::DeviceType;
+   using IndexType = typename ArrayType::IndexType;
+   using HostArrayType = typename ArrayType::template Self< ValueType, Devices::Sequential >;  // same array on Devices::Sequential, used for element-wise checks
+   const int size = ARRAY_TEST_SIZE;
+
+   ArrayType v( size );
+   ViewType v_view( v );
+   HostArrayType v_host( size );
+
+   setConstantSequence( v, 0 );
+   v_host = -1;  // sentinel; overwritten by the copy from v below
+   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+   setConstantSequence( v, 1 );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v_view;  // this case copies the result through the view instead of the array
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
+
+   setLinearSequence( v );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
+
+   // repeat the same three cases, this time running the scan through v_view
+   setConstantSequence( v, 0 );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+   setConstantSequence( v, 1 );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v_view;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
+
+   setLinearSequence( v );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
+
+   // With CUDA, repeat all six cases with maxGridSize() lowered to 3 and check via
+   // gridsCount() > 1 that each scan was actually split into multiple CUDA grids.
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
+
+      setConstantSequence( v, 0 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+      setConstantSequence( v, 1 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      v_host = v_view;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
+
+      setLinearSequence( v );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
+
+      // same three cases again, scanning through v_view
+      setConstantSequence( v, 0 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+      setConstantSequence( v, 1 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      v_host = v_view;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
+
+      setLinearSequence( v );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
+
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();  // undo the maxGridSize() override set at the top of this section
+#endif
+   }
+}
+
+TYPED_TEST( ScanTest, exclusiveScan )  // exclusive scan with std::plus over [0, size), written back into v and compared with closed-form sums
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   using ViewType = typename TestFixture::ViewType;
+   using ValueType = typename ArrayType::ValueType;
+   using DeviceType = typename ArrayType::DeviceType;
+   using IndexType = typename ArrayType::IndexType;
+   using HostArrayType = typename ArrayType::template Self< ValueType, Devices::Sequential >;  // same array on Devices::Sequential, used for element-wise checks
+   const int size = ARRAY_TEST_SIZE;
+
+   ArrayType v;
+   v.setSize( size );
+   ViewType v_view( v );
+   HostArrayType v_host( size );
+
+   setConstantSequence( v, 0 );
+   v_host = -1;  // sentinel; overwritten by the copy from v below
+   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+   setConstantSequence( v, 1 );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
+
+   setLinearSequence( v );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
+
+   // repeat the same three cases, this time running the scan through v_view
+   setConstantSequence( v, 0 );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+   setConstantSequence( v, 1 );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
+
+   setLinearSequence( v );
+   v_host = -1;
+   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   v_host = v;
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
+
+   // With CUDA, repeat all six cases with maxGridSize() lowered to 3 and check via
+   // gridsCount() > 1 that each scan was actually split into multiple CUDA grids.
+   if( std::is_same< DeviceType, Devices::Cuda >::value )
+   {
+#ifdef HAVE_CUDA
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
+
+      setConstantSequence( v, 0 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+      setConstantSequence( v, 1 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
+
+      setLinearSequence( v );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
+
+      // same three cases again, scanning through v_view
+      setConstantSequence( v, 0 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
+
+      setConstantSequence( v, 1 );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
+
+      setLinearSequence( v );
+      v_host = -1;
+      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      v_host = v;
+      for( int i = 0; i < size; i++ )
+         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
+
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();  // undo the maxGridSize() override set at the top of this section
+#endif
+   }
+}
+
+// TODO: test scan with custom begin and end parameters
+
+#endif // HAVE_GTEST
+
+#include "../main.h"
diff --git a/src/UnitTests/Algorithms/ScanTestCuda.cu b/src/UnitTests/Algorithms/ScanTestCuda.cu
new file mode 100644
index 000000000..ac886b753
--- /dev/null
+++ b/src/UnitTests/Algorithms/ScanTestCuda.cu
@@ -0,0 +1 @@
+#include "ScanTest.h"
diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt
index 9d9e41343..4c3945202 100644
--- a/src/UnitTests/Containers/CMakeLists.txt
+++ b/src/UnitTests/Containers/CMakeLists.txt
@@ -8,7 +8,6 @@ set( CPP_TESTS
          StaticVectorOperationsTest
          StaticVectorOfStaticVectorsTest
          VectorTest
-         VectorPrefixSumTest
          VectorEvaluateAndReduceTest
          VectorBinaryOperationsTest
          VectorUnaryOperationsTest
@@ -19,7 +18,6 @@ set( CUDA_TESTS
          ArrayTestCuda
          ArrayViewTestCuda
          VectorTestCuda
-         VectorPrefixSumTestCuda
          VectorEvaluateAndReduceTestCuda
          VectorBinaryOperationsTestCuda
          VectorUnaryOperationsTestCuda
@@ -56,16 +54,6 @@ if( ${BUILD_MPI} )
       TARGET_LINK_LIBRARIES( DistributedArrayTest ${GTEST_BOTH_LIBRARIES} )
    endif()
 
-   ADD_EXECUTABLE( DistributedVectorTest DistributedVectorTest.cpp )
-   TARGET_COMPILE_OPTIONS( DistributedVectorTest PRIVATE ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( DistributedVectorTest ${GTEST_BOTH_LIBRARIES} )
-
-   if( BUILD_CUDA )
-      CUDA_ADD_EXECUTABLE( DistributedVectorTestCuda DistributedVectorTestCuda.cu
-                           OPTIONS ${CXX_TESTS_FLAGS} )
-      TARGET_LINK_LIBRARIES( DistributedVectorTestCuda ${GTEST_BOTH_LIBRARIES} )
-   endif()
-
    ADD_EXECUTABLE( DistributedVectorBinaryOperationsTest DistributedVectorBinaryOperationsTest.cpp )
    TARGET_COMPILE_OPTIONS( DistributedVectorBinaryOperationsTest PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( DistributedVectorBinaryOperationsTest ${GTEST_BOTH_LIBRARIES} )
@@ -93,10 +81,6 @@ if( ${BUILD_MPI} )
    ADD_TEST( NAME DistributedArrayTest COMMAND "mpirun" ${mpi_test_parameters})
    ADD_TEST( NAME DistributedArrayTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
-   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
-   ADD_TEST( NAME DistributedVectorTest COMMAND "mpirun" ${mpi_test_parameters})
-   ADD_TEST( NAME DistributedVectorTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
-
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorBinaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
    ADD_TEST( NAME DistributedVectorBinaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
@@ -110,10 +94,6 @@ if( ${BUILD_MPI} )
    ADD_TEST( NAME DistributedVectorVerticalOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    if( BUILD_CUDA )
-      SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
-      ADD_TEST( NAME DistributedVectorTestCuda COMMAND "mpirun" ${mpi_test_parameters})
-      ADD_TEST( NAME DistributedVectorTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
-
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
       ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
diff --git a/src/UnitTests/Containers/DistributedVectorTest.cpp b/src/UnitTests/Containers/DistributedVectorTest.cpp
deleted file mode 100644
index 5b0c61c85..000000000
--- a/src/UnitTests/Containers/DistributedVectorTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "DistributedVectorTest.h"
diff --git a/src/UnitTests/Containers/DistributedVectorTestCuda.cu b/src/UnitTests/Containers/DistributedVectorTestCuda.cu
deleted file mode 100644
index 5b0c61c85..000000000
--- a/src/UnitTests/Containers/DistributedVectorTestCuda.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "DistributedVectorTest.h"
diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h
index 32f2d52ba..f0d67076a 100644
--- a/src/UnitTests/Containers/VectorHelperFunctions.h
+++ b/src/UnitTests/Containers/VectorHelperFunctions.h
@@ -30,7 +30,7 @@ void setLinearSequence( Vector& deviceVector )
 
 template< typename Vector >
 void setConstantSequence( Vector& deviceVector,
-                          typename Vector::RealType v )
+                          typename Vector::ValueType v )
 {
    deviceVector.setValue( v );
 }
@@ -38,7 +38,7 @@ void setConstantSequence( Vector& deviceVector,
 template< typename Vector >
 void setOscilatingLinearSequence( Vector& deviceVector )
 {
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
    for( int i = 0; i < a.getSize(); i++ )
@@ -48,9 +48,9 @@ void setOscilatingLinearSequence( Vector& deviceVector )
 
 template< typename Vector >
 void setOscilatingConstantSequence( Vector& deviceVector,
-                                    typename Vector::RealType v )
+                                    typename Vector::ValueType v )
 {
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
    for( int i = 0; i < a.getSize(); i++ )
@@ -61,7 +61,7 @@ void setOscilatingConstantSequence( Vector& deviceVector,
 template< typename Vector >
 void setNegativeLinearSequence( Vector& deviceVector )
 {
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
 #ifdef DISTRIBUTED_VECTOR
@@ -80,12 +80,12 @@ void setNegativeLinearSequence( Vector& deviceVector )
 
 template< typename Vector >
 void setOscilatingSequence( Vector& deviceVector,
-                            typename Vector::RealType v )
+                            typename Vector::ValueType v )
 {
 #ifdef STATIC_VECTOR
    Vector a;
 #else
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
 #endif
diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.cpp b/src/UnitTests/Containers/VectorPrefixSumTest.cpp
deleted file mode 100644
index 41ae5f65f..000000000
--- a/src/UnitTests/Containers/VectorPrefixSumTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "VectorPrefixSumTest.h"
diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h
deleted file mode 100644
index 3bf99d997..000000000
--- a/src/UnitTests/Containers/VectorPrefixSumTest.h
+++ /dev/null
@@ -1,269 +0,0 @@
-/***************************************************************************
-                          VectorTest-2.h  -  description
-                             -------------------
-    begin                : Oct 25, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#ifdef HAVE_GTEST
-#include "VectorTestSetup.h"
-
-// should be small enough to have fast tests, but larger than minGPUReductionDataSize
-// and large enough to require multiple CUDA blocks for reduction
-constexpr int VECTOR_TEST_SIZE = 10000;
-
-TYPED_TEST( VectorTest, scan )
-{
-   using VectorType = typename TestFixture::VectorType;
-   using ViewType = typename TestFixture::ViewType;
-   using RealType = typename VectorType::RealType;
-   using DeviceType = typename VectorType::DeviceType;
-   using IndexType = typename VectorType::IndexType;
-   using HostVectorType = typename VectorType::template Self< RealType, Devices::Sequential >;
-   const int size = VECTOR_TEST_SIZE;
-
-   // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
-      return;
-
-   VectorType v( size );
-   ViewType v_view( v );
-   HostVectorType v_host( size );
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v.scan();
-   v_host = v_view;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.scan();
-   v_host = v_view;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v_view.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
-#endif
-   }
-}
-
-TYPED_TEST( VectorTest, exclusiveScan )
-{
-   using VectorType = typename TestFixture::VectorType;
-   using ViewType = typename TestFixture::ViewType;
-   using RealType = typename VectorType::RealType;
-   using DeviceType = typename VectorType::DeviceType;
-   using IndexType = typename VectorType::IndexType;
-   using HostVectorType = typename VectorType::template Self< RealType, Devices::Sequential >;
-   const int size = VECTOR_TEST_SIZE;
-
-   // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
-      return;
-
-   VectorType v;
-   v.setSize( size );
-   ViewType v_view( v );
-   HostVectorType v_host( size );
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
-#endif
-   }
-}
-
-// TODO: test scan with custom begin and end parameters
-
-#endif // HAVE_GTEST
-
-#include "../main.h"
diff --git a/src/UnitTests/Containers/VectorPrefixSumTestCuda.cu b/src/UnitTests/Containers/VectorPrefixSumTestCuda.cu
deleted file mode 100644
index 41ae5f65f..000000000
--- a/src/UnitTests/Containers/VectorPrefixSumTestCuda.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "VectorPrefixSumTest.h"
-- 
GitLab


From 624e709fc78dbcb37844a91741913860e49bb80b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 10 Jul 2021 22:19:13 +0200
Subject: [PATCH 17/52] Fixed copy-assignment operator in DistributedArrayView
 according to DistributedVectorView

---
 src/TNL/Containers/DistributedArrayView.hpp | 22 +++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 223ea99c8..40b6f509a 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -375,10 +375,13 @@ operator=( const DistributedArrayView& view )
    TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getGhosts(), view.getGhosts(), "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." );
-   localData = view.getConstLocalViewWithGhosts();
-   // set, but do not unset, the synchronizer
-   if( view.getSynchronizer() )
-      setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() );
+
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      view.waitForSynchronization();
+      getLocalViewWithGhosts() = view.getConstLocalViewWithGhosts();
+   }
    return *this;
 }
 
@@ -394,10 +397,13 @@ operator=( const Array& array )
    TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getGhosts(), array.getGhosts(), "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." );
-   localData = array.getConstLocalViewWithGhosts();
-   // set, but do not unset, the synchronizer
-   if( array.getSynchronizer() )
-      setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() );
+
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      array.waitForSynchronization();
+      getLocalViewWithGhosts() = array.getConstLocalViewWithGhosts();
+   }
    return *this;
 }
 
-- 
GitLab


From c1780697c1741ae339abd061242aa39389881a16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 11 Jul 2021 13:43:28 +0200
Subject: [PATCH 18/52] Moved implementations of scan and distributed scan into
 the detail namespace

The algorithms are supposed to be used via overloaded plain functions in
the Algorithms namespace: for now, there are only inplaceInclusiveScan
and inplaceExclusiveScan (and their distributed variants).

The scan and segmentedScan methods were removed from data structures
(Vector, VectorView, DistributedVector, DistributedVectorView). They
were inflexible (only std::plus was actually used for reduction),
incomplete (some overloads just threw NotImplementedError), and they
violated the open-closed principle:
https://en.wikipedia.org/wiki/Open%E2%80%93closed_principle
---
 .../Tutorials/ReductionAndScan/CMakeLists.txt |   6 +-
 .../ReductionAndScan/ExclusiveScanExample.cpp |  48 -----
 .../ReductionAndScan/ExclusiveScanExample.cu  |   1 -
 .../ReductionAndScan/ScanExample.cpp          |  47 -----
 .../Tutorials/ReductionAndScan/ScanExample.cu |   1 -
 .../ReductionAndScan/SegmentedScanExample.cpp |  22 +--
 .../inplaceExclusiveScanExample.cpp           |  31 ++++
 .../inplaceExclusiveScanExample.cu            |   1 +
 .../inplaceInclusiveScanExample.cpp           |  31 ++++
 .../inplaceInclusiveScanExample.cu            |   1 +
 .../tutorial_ReductionAndScan.md              |  32 ++--
 src/Benchmarks/BLAS/vector-operations.h       |   9 +-
 .../ReferenceFormats/Legacy/BiEllpack_impl.h  |   4 +-
 .../SpMV/ReferenceFormats/Legacy/CSR_impl.h   |   3 +-
 .../Legacy/ChunkedEllpack_impl.h              |   3 +-
 .../Legacy/SlicedEllpack_impl.h               |   3 +-
 src/TNL/Algorithms/SegmentedScan.h            |  14 +-
 src/TNL/Algorithms/SegmentedScan.hpp          |   8 +-
 src/TNL/Algorithms/Segments/BiEllpack.hpp     |   3 +-
 .../Algorithms/Segments/ChunkedEllpack.hpp    |   5 +-
 src/TNL/Algorithms/Segments/SlicedEllpack.hpp |   3 +-
 src/TNL/Algorithms/Segments/details/CSR.h     |   3 +-
 src/TNL/Algorithms/detail/CudaScanKernel.h    |   1 +
 .../Algorithms/{ => detail}/DistributedScan.h |  25 +--
 src/TNL/Algorithms/{ => detail}/Scan.h        | 119 +------------
 src/TNL/Algorithms/{ => detail}/Scan.hpp      |   6 +-
 src/TNL/Algorithms/detail/ScanType.h          |  26 +++
 src/TNL/Algorithms/distributedScan.h          | 150 ++++++++++++++++
 src/TNL/Algorithms/scan.h                     | 164 ++++++++++++++++++
 src/TNL/Containers/DistributedVector.h        |   3 -
 src/TNL/Containers/DistributedVector.hpp      |  13 --
 src/TNL/Containers/DistributedVectorView.h    |   3 -
 src/TNL/Containers/DistributedVectorView.hpp  |  15 --
 src/TNL/Containers/Vector.h                   |  80 ---------
 src/TNL/Containers/Vector.hpp                 |  56 ------
 src/TNL/Containers/VectorView.h               |  82 ---------
 src/TNL/Containers/VectorView.hpp             |  52 ------
 .../DistributedMeshSynchronizer.h             |   3 +-
 .../DistributedMeshes/distributeSubentities.h |   3 +-
 src/UnitTests/Algorithms/CMakeLists.txt       |  26 +--
 .../Algorithms/DistributedScanTest.cpp        |   1 -
 .../Algorithms/DistributedScanTestCuda.cu     |   1 -
 src/UnitTests/Algorithms/ScanTest.cpp         |   1 -
 src/UnitTests/Algorithms/ScanTestCuda.cu      |   1 -
 .../Algorithms/distributedScanTest.cpp        |   1 +
 ...ibutedScanTest.h => distributedScanTest.h} |  82 ++++-----
 .../Algorithms/distributedScanTestCuda.cu     |   1 +
 src/UnitTests/Algorithms/scanTest.cpp         |   1 +
 .../Algorithms/{ScanTest.h => scanTest.h}     |  82 ++++-----
 src/UnitTests/Algorithms/scanTestCuda.cu      |   1 +
 .../DistributedMeshes/DistributedMeshTest.h   |   4 +-
 51 files changed, 600 insertions(+), 682 deletions(-)
 delete mode 100644 Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cpp
 delete mode 120000 Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cu
 delete mode 100644 Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
 delete mode 120000 Documentation/Tutorials/ReductionAndScan/ScanExample.cu
 create mode 100644 Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cpp
 create mode 120000 Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cu
 create mode 100644 Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cpp
 create mode 120000 Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cu
 rename src/TNL/Algorithms/{ => detail}/DistributedScan.h (79%)
 rename src/TNL/Algorithms/{ => detail}/Scan.h (51%)
 rename src/TNL/Algorithms/{ => detail}/Scan.hpp (99%)
 create mode 100644 src/TNL/Algorithms/detail/ScanType.h
 create mode 100644 src/TNL/Algorithms/distributedScan.h
 create mode 100644 src/TNL/Algorithms/scan.h
 delete mode 100644 src/UnitTests/Algorithms/DistributedScanTest.cpp
 delete mode 100644 src/UnitTests/Algorithms/DistributedScanTestCuda.cu
 delete mode 100644 src/UnitTests/Algorithms/ScanTest.cpp
 delete mode 100644 src/UnitTests/Algorithms/ScanTestCuda.cu
 create mode 100644 src/UnitTests/Algorithms/distributedScanTest.cpp
 rename src/UnitTests/Algorithms/{DistributedScanTest.h => distributedScanTest.h} (70%)
 create mode 100644 src/UnitTests/Algorithms/distributedScanTestCuda.cu
 create mode 100644 src/UnitTests/Algorithms/scanTest.cpp
 rename src/UnitTests/Algorithms/{ScanTest.h => scanTest.h} (69%)
 create mode 100644 src/UnitTests/Algorithms/scanTestCuda.cu

diff --git a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
index 594ebd8cd..85e762868 100644
--- a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
+++ b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
@@ -12,8 +12,8 @@ set( COMMON_EXAMPLES
      MapReduceExample-3
      ReductionWithArgument
      ReductionWithArgumentWithFunctional
-     ScanExample
-     ExclusiveScanExample
+     inplaceInclusiveScanExample
+     inplaceExclusiveScanExample
      SegmentedScanExample
 )
 
@@ -46,4 +46,4 @@ IF( BUILD_CUDA )
    ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
 ELSE()
    ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples ALL DEPENDS ${HOST_OUTPUTS} )
-ENDIF()
\ No newline at end of file
+ENDIF()
diff --git a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cpp
deleted file mode 100644
index 29817aa14..000000000
--- a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-using namespace TNL::Algorithms;
-
-template< typename Device >
-void scan( Vector< double, Device >& v )
-{
-   /***
-    * Reduction is sum of two numbers.
-    */
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-
-   /***
-    * As parameters, we pass vector on which the scan is to be performed, interval
-    * where the scan is performed, lambda function which is used by the scan and
-    * zero element (idempotent) of the 'sum' operation.
-    */
-   Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduce, 0.0 );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Firstly, test the exclusive prefix sum with vectors allocated on CPU.
-    */
-   Vector< double, Devices::Host > host_v( 10 );
-   host_v = 1.0;
-   std::cout << "host_v = " << host_v << std::endl;
-   scan( host_v );
-   std::cout << "The exclusive prefix sum of the host vector is " << host_v << "." << std::endl;
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v( 10 );
-   cuda_v = 1.0;
-   std::cout << "cuda_v = " << cuda_v << std::endl;
-   scan( cuda_v );
-   std::cout << "The exclusive prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cu
deleted file mode 120000
index 75896ca31..000000000
--- a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cu
+++ /dev/null
@@ -1 +0,0 @@
-ExclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
deleted file mode 100644
index 5281bfd54..000000000
--- a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-using namespace TNL::Algorithms;
-
-template< typename Device >
-void scan( Vector< double, Device >& v )
-{
-   /***
-    * Reduction is sum of two numbers.
-    */
-   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-
-   /***
-    * As parameters, we pass vector on which the scan is to be performed, interval
-    * where the scan is performed, lambda function which is used by the scan and
-    * zero element (idempotent) of the 'sum' operation.
-    */
-   Scan< Device >::perform( v, 0, v.getSize(), reduction, 0.0 );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Firstly, test the prefix sum with vectors allocated on CPU.
-    */
-   Vector< double, Devices::Host > host_v( 10 );
-   host_v = 1.0;
-   std::cout << "host_v = " << host_v << std::endl;
-   scan( host_v );
-   std::cout << "The prefix sum of the host vector is " << host_v << "." << std::endl;
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v( 10 );
-   cuda_v = 1.0;
-   std::cout << "cuda_v = " << cuda_v << std::endl;
-   scan( cuda_v );
-   std::cout << "The prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
-#endif
-   return EXIT_SUCCESS;
-}
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ScanExample.cu b/Documentation/Tutorials/ReductionAndScan/ScanExample.cu
deleted file mode 120000
index d93679f61..000000000
--- a/Documentation/Tutorials/ReductionAndScan/ScanExample.cu
+++ /dev/null
@@ -1 +0,0 @@
-ScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp
index 5e1379f5d..7cfd43354 100644
--- a/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp
@@ -1,13 +1,13 @@
 #include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/SegmentedScan.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
 using namespace TNL::Algorithms;
 
 template< typename Device >
-void segmentedScan( Vector< double, Device >& v, Vector< bool, Device >& flags )
+void segmentedScan( Array< double, Device >& v, Array< bool, Device >& flags )
 {
    /***
     * Reduction is sum of two numbers.
@@ -15,7 +15,7 @@ void segmentedScan( Vector< double, Device >& v, Vector< bool, Device >& flags )
    auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
 
    /***
-    * As parameters, we pass vector on which the scan is to be performed, interval
+    * As parameters, we pass array on which the scan is to be performed, interval
     * where the scan is performed, lambda function which is used by the scan and
     * zero element (idempotent) of the 'sum' operation.
     */
@@ -25,25 +25,25 @@ void segmentedScan( Vector< double, Device >& v, Vector< bool, Device >& flags )
 int main( int argc, char* argv[] )
 {
    /***
-    * Firstly, test the segmented prefix sum with vectors allocated on CPU.
+    * Firstly, test the segmented prefix sum with arrays allocated on CPU.
     */
-   Vector< bool, Devices::Host > host_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
-   Vector< double, Devices::Host > host_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
+   Array< bool, Devices::Host > host_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
+   Array< double, Devices::Host > host_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
    std::cout << "host_flags = " << host_flags << std::endl;
    std::cout << "host_v     = " << host_v << std::endl;
    segmentedScan( host_v, host_flags );
-   std::cout << "The segmented prefix sum of the host vector is " << host_v << "." << std::endl;
+   std::cout << "The segmented prefix sum of the host array is " << host_v << "." << std::endl;
 
    /***
     * And then also on GPU.
     */
 #ifdef HAVE_CUDA
-   //Vector< bool, Devices::Cuda > cuda_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
-   //Vector< double, Devices::Cuda > cuda_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
+   //Array< bool, Devices::Cuda > cuda_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
+   //Array< double, Devices::Cuda > cuda_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
    //std::cout << "cuda_flags = " << cuda_flags << std::endl;
    //std::cout << "cuda_v     = " << cuda_v << std::endl;
    //segmentedScan( cuda_v, cuda_flags );
-   //std::cout << "The segmnted prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
+   //std::cout << "The segmented prefix sum of the CUDA array is " << cuda_v << "." << std::endl;
 #endif
    return EXIT_SUCCESS;
 }
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cpp
new file mode 100644
index 000000000..012e4bcb3
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the prefix sum with an array allocated on CPU.
+    */
+   Array< double, Devices::Host > host_a( 10 );
+   host_a = 1.0;
+   std::cout << "host_a = " << host_a << std::endl;
+   inplaceExclusiveScan( host_a );
+   std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_a( 10 );
+   cuda_a = 1.0;
+   std::cout << "cuda_a = " << cuda_a << std::endl;
+   inplaceExclusiveScan( cuda_a );
+   std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cu
new file mode 120000
index 000000000..b7692b9c7
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cu
@@ -0,0 +1 @@
+inplaceExclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cpp
new file mode 100644
index 000000000..ebf42f247
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the prefix sum with an array allocated on CPU.
+    */
+   Array< double, Devices::Host > host_a( 10 );
+   host_a = 1.0;
+   std::cout << "host_a = " << host_a << std::endl;
+   inplaceInclusiveScan( host_a );
+   std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_a( 10 );
+   cuda_a = 1.0;
+   std::cout << "cuda_a = " << cuda_a << std::endl;
+   inplaceInclusiveScan( cuda_a );
+   std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cu
new file mode 120000
index 000000000..3f1794e21
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cu
@@ -0,0 +1 @@
+inplaceInclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index 35246fe4e..59ab08d3f 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -216,29 +216,35 @@ and exclusive prefix sum of the same sequence is
 [0,1,4,9,16,25,36]
 ```
 
-Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only on sumation, however product or logical operations could be handy as well. In TNL, prefix sum is implemented in simillar way as reduction and so it can be easily modified by lambda functions. The following example shows how it works:
+Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only to summation; however, product or logical operations can be handy as well. In TNL, scan is implemented in a similar way to reduction and uses the same functors as the reduction operation. The following example shows how it works:
 
-\includelineno ScanExample.cpp
-
-Scan does not use `fetch` function because the scan must be performed on a vector (the first parameter we pass to the scan). Its complexity is also higher compared to reduction. Thus if one needs to do some operation with the vector elements before the scan, this can be done explicitly and it will not affect the performance significantlty. On the other hand, the scan function takes interval of the vector elements where the scan is performed as its second and third argument. The next argument is the operation to be performed by the scan and the last parameter is the idempotent ("zero") element if the operation.
-
-The result looks as:
-
-\include ScanExample.out
+```
+inplaceInclusiveScan( array, 0, array.getSize(), TNL::Plus{} );
+```
 
-Exclusive scan works the same way, we just need to specify it by the second template parameter which is set to `ScanType::Exclusive`. The call of the scan then looks as
+This is equivalent to the following shortened call (the second, third and fourth parameters have default values):
 
 ```
-Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduction, 0.0 );
+inplaceInclusiveScan( array );
 ```
 
 The complete example looks as follows:
 
-\includelineno ExclusiveScanExample.cpp
+\includelineno inplaceInclusiveScanExample.cpp
+
+Scan does not use a `fetch` function because the scan must be performed on an array. Its complexity is also higher compared to reduction. Thus, if one needs to do some operation with the array elements before the scan, this can be done explicitly and it will not affect the performance significantly. On the other hand, the scan function takes the interval of the array elements where the scan is performed as its second and third arguments. The next argument is the operation to be performed by the scan and the last parameter is the idempotent ("zero") element of the operation.
+
+The result looks as:
+
+\include inplaceInclusiveScanExample.out
+
+Exclusive scan works similarly. The complete example looks as follows:
+
+\includelineno inplaceExclusiveScanExample.cpp
 
 And the result looks as:
 
-\include ExclusiveScanExample.out
+\include inplaceExclusiveScanExample.out
 
 ### Segmented scan
 
@@ -272,4 +278,4 @@ In addition to common scan, we need to encode the segments of the input sequence
 
 The result reads as:
 
-\include SegmentedScanExample.out
\ No newline at end of file
+\include SegmentedScanExample.out
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 5531b360d..fc5f1b29e 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -17,6 +17,7 @@
 #include "../Benchmarks.h"
 
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include "CommonVectorOperations.h"
 #include "VectorOperations.h"
 
@@ -566,13 +567,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
    ////
    // Inclusive scan
    auto inclusiveScanHost = [&]() {
-      hostVector.scan();
+      Algorithms::inplaceInclusiveScan( hostVector );
    };
    benchmark.setOperation( "inclusive scan", 2 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU ET", inclusiveScanHost );
 #ifdef HAVE_CUDA
    auto inclusiveScanCuda = [&]() {
-      deviceVector.scan();
+      Algorithms::inplaceInclusiveScan( deviceVector );
    };
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusiveScanCuda );
 #endif
@@ -580,13 +581,13 @@ benchmarkVectorOperations( Benchmark & benchmark,
    ////
    // Exclusive scan
    auto exclusiveScanHost = [&]() {
-      hostVector.template scan< Algorithms::ScanType::Exclusive >();
+      Algorithms::inplaceExclusiveScan( hostVector );
    };
    benchmark.setOperation( "exclusive scan", 2 * datasetSize );
    benchmark.time< Devices::Host >( reset1, "CPU ET", exclusiveScanHost );
 #ifdef HAVE_CUDA
    auto exclusiveScanCuda = [&]() {
-      deviceVector.template scan< Algorithms::ScanType::Exclusive >();
+      Algorithms::inplaceExclusiveScan( deviceVector );
    };
    benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusiveScanCuda );
 #endif
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
index d33ee47cc..e1f7f8fa3 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
@@ -13,6 +13,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <cstdio>
 
@@ -97,8 +98,7 @@ setCompressedRowLengths( ConstRowsCapacitiesTypeView constRowLengths )
     DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
     DeviceDependentCode::computeColumnSizes( *this, rowLengths );
 
-    //this->groupPointers.computeExclusivePrefixSum();
-    this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
+    Algorithms::inplaceExclusiveScan( this->groupPointers );
 
     DeviceDependentCode::verifyRowPerm( *this, rowLengths );
     DeviceDependentCode::verifyRowLengths( *this, rowLengths );
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
index ed5ec486c..2cb2b4784 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
@@ -12,6 +12,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h>
 #include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Exceptions/NotImplementedError.h>
@@ -102,7 +103,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstRowsC
    rowPtrs.bind( this->rowPointers.getData(), this->getRows() );
    rowPtrs = rowLengths;
    this->rowPointers.setElement( this->rows, 0 );
-   this->rowPointers.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( this->rowPointers );
    this->maxRowLength = max( rowLengths );
 
    /****
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
index df6622777..28bd8313a 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
@@ -12,6 +12,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
@@ -218,7 +219,7 @@ void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( ConstRowsCa
       this->rowPointers.setElement( 0, 0 );
       for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ )
          this->setSlice( rowLengths, sliceIndex, elementsToAllocation );
-      this->rowPointers.scan();
+      Algorithms::inplaceInclusiveScan( this->rowPointers );
    }
 
    if( std::is_same< Device, Devices::Cuda >::value )
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
index c7127cf1f..4dc0f4480 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
@@ -12,6 +12,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
@@ -82,7 +83,7 @@ void SlicedEllpack< Real, Device, Index, SliceSize >::setCompressedRowLengths( C
 
    this->maxRowLength = max( rowLengths );
 
-   this->slicePointers.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( this->slicePointers );
    this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
 }
 
diff --git a/src/TNL/Algorithms/SegmentedScan.h b/src/TNL/Algorithms/SegmentedScan.h
index f16a5335c..10412b747 100644
--- a/src/TNL/Algorithms/SegmentedScan.h
+++ b/src/TNL/Algorithms/SegmentedScan.h
@@ -12,11 +12,13 @@
 
 #pragma once
 
+// TODO: move this into the detail namespace, create dispatching functions like
+// inplaceInclusiveSegmentedScan, inplaceExclusiveSegmentedScan, etc.
+
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-
-#include "Scan.h"  // only for the ScanType
+#include <TNL/Algorithms/detail/ScanType.h>
 
 namespace TNL {
 namespace Algorithms {
@@ -61,10 +63,10 @@ namespace Algorithms {
  * **Note: Segmented scan is not implemented for CUDA yet.**
  */
 template< typename Device,
-          ScanType Type = ScanType::Inclusive >
+          detail::ScanType Type = detail::ScanType::Inclusive >
 struct SegmentedScan;
 
-template< ScanType Type >
+template< detail::ScanType Type >
 struct SegmentedScan< Devices::Sequential, Type >
 {
    /**
@@ -108,7 +110,7 @@ struct SegmentedScan< Devices::Sequential, Type >
             const typename Vector::ValueType zero );
 };
 
-template< ScanType Type >
+template< detail::ScanType Type >
 struct SegmentedScan< Devices::Host, Type >
 {
    /**
@@ -152,7 +154,7 @@ struct SegmentedScan< Devices::Host, Type >
             const typename Vector::ValueType zero );
 };
 
-template< ScanType Type >
+template< detail::ScanType Type >
 struct SegmentedScan< Devices::Cuda, Type >
 {
    /**
diff --git a/src/TNL/Algorithms/SegmentedScan.hpp b/src/TNL/Algorithms/SegmentedScan.hpp
index 467146e51..18427a79d 100644
--- a/src/TNL/Algorithms/SegmentedScan.hpp
+++ b/src/TNL/Algorithms/SegmentedScan.hpp
@@ -19,7 +19,7 @@
 namespace TNL {
 namespace Algorithms {
 
-template< ScanType Type >
+template< detail::ScanType Type >
    template< typename Vector,
              typename Reduction,
              typename Flags >
@@ -35,7 +35,7 @@ perform( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
-   if( Type == ScanType::Inclusive )
+   if( Type == detail::ScanType::Inclusive )
    {
       for( IndexType i = begin + 1; i < end; i++ )
          if( ! flags[ i ] )
@@ -56,7 +56,7 @@ perform( Vector& v,
    }
 }
 
-template< ScanType Type >
+template< detail::ScanType Type >
    template< typename Vector,
              typename Reduction,
              typename Flags >
@@ -77,7 +77,7 @@ perform( Vector& v,
 #endif
 }
 
-template< ScanType Type >
+template< detail::ScanType Type >
    template< typename Vector,
              typename Reduction,
              typename Flags >
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index d0847b6a3..9eb71956a 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -13,6 +13,7 @@
 #include <math.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 
@@ -345,7 +346,7 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
       this->performRowBubbleSort( segmentsSizes );
       this->computeColumnSizes( segmentsSizes );
 
-      this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
+      inplaceExclusiveScan( this->groupPointers );
 
       this->verifyRowPerm( segmentsSizes );
       //this->verifyRowLengths( segmentsSizes ); // TODO: I am not sure what this test is doing.
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 69e6b4c67..fd3df2053 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -12,6 +12,7 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpack.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 
@@ -37,7 +38,7 @@ ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 ChunkedEllpack( const ChunkedEllpack& chunkedEllpack )
    : size( chunkedEllpack.size ),
      storageSize( chunkedEllpack.storageSize ),
-     chunksInSlice( chunkedEllpack.chunksInSlice ), 
+     chunksInSlice( chunkedEllpack.chunksInSlice ),
      desiredChunkSize( chunkedEllpack.desiredChunkSize ),
      rowToChunkMapping( chunkedEllpack.rowToChunkMapping ),
      rowToSliceMapping( chunkedEllpack.rowToSliceMapping ),
@@ -273,7 +274,7 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
       this->storageSize = 0;
       for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ )
          this->setSlice( segmentsSizes, sliceIndex, storageSize );
-      this->rowPointers.scan();
+      inplaceInclusiveScan( this->rowPointers );
       IndexType chunksCount = this->numberOfSlices * this->chunksInSlice;
       this->chunksToSegmentsMapping.setSize( chunksCount );
       IndexType chunkIdx( 0 );
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 82e7a8571..4482cd567 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -12,6 +12,7 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 
@@ -152,7 +153,7 @@ setSegmentsSizes( const SizesHolder& sizes )
       slice_segment_size_view[ i ] = res;
    };
    ellpack.allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
-   this->sliceOffsets.template scan< Algorithms::ScanType::Exclusive >();
+   inplaceExclusiveScan( this->sliceOffsets );
    this->size = sum( sizes );
    this->alignedSize = this->sliceOffsets.getElement( slicesCount );
 }
diff --git a/src/TNL/Algorithms/Segments/details/CSR.h b/src/TNL/Algorithms/Segments/details/CSR.h
index b9392815d..193758f70 100644
--- a/src/TNL/Algorithms/Segments/details/CSR.h
+++ b/src/TNL/Algorithms/Segments/details/CSR.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <TNL/Algorithms/scan.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -35,7 +36,7 @@ class CSR
             view = sizes;
          }
          offsets.setElement( sizes.getSize(), 0 );
-         offsets.template scan< Algorithms::ScanType::Exclusive >();
+         inplaceExclusiveScan( offsets );
       }
 
       template< typename CSROffsets >
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index ddce0c6ea..7338f56ff 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -16,6 +16,7 @@
 #include <TNL/Cuda/SharedMemory.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Containers/Array.h>
+#include "ScanType.h"
 
 namespace TNL {
 namespace Algorithms {
diff --git a/src/TNL/Algorithms/DistributedScan.h b/src/TNL/Algorithms/detail/DistributedScan.h
similarity index 79%
rename from src/TNL/Algorithms/DistributedScan.h
rename to src/TNL/Algorithms/detail/DistributedScan.h
index d6e60949c..1046f3df6 100644
--- a/src/TNL/Algorithms/DistributedScan.h
+++ b/src/TNL/Algorithms/detail/DistributedScan.h
@@ -12,27 +12,29 @@
 
 #pragma once
 
-#include <TNL/Algorithms/Scan.h>
-#include <TNL/Containers/Vector.h>
+#include "Scan.h"
+
+#include <TNL/Containers/Array.h>
 #include <TNL/MPI/Wrappers.h>
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 template< ScanType Type >
 struct DistributedScan
 {
-   template< typename DistributedVector,
+   template< typename DistributedArray,
              typename Reduction >
    static void
-   perform( DistributedVector& v,
-            typename DistributedVector::IndexType begin,
-            typename DistributedVector::IndexType end,
-            const Reduction& reduction,
-            const typename DistributedVector::ValueType zero )
+   perform( DistributedArray& v,
+            typename DistributedArray::IndexType begin,
+            typename DistributedArray::IndexType end,
+            Reduction&& reduction,
+            typename DistributedArray::ValueType zero )
    {
-      using ValueType = typename DistributedVector::ValueType;
-      using DeviceType = typename DistributedVector::DeviceType;
+      using ValueType = typename DistributedArray::ValueType;
+      using DeviceType = typename DistributedArray::DeviceType;
 
       const auto group = v.getCommunicationGroup();
       if( group != MPI::NullGroup() ) {
@@ -50,7 +52,7 @@ struct DistributedScan
          const int nproc = MPI::GetSize( group );
          ValueType dataForScatter[ nproc ];
          for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = local_result;
-         Containers::Vector< ValueType, Devices::Host > rank_results( nproc );
+         Containers::Array< ValueType, Devices::Host > rank_results( nproc );
          // NOTE: exchanging general data types does not work with MPI
          MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );
 
@@ -64,5 +66,6 @@ struct DistributedScan
    }
 };
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Scan.h b/src/TNL/Algorithms/detail/Scan.h
similarity index 51%
rename from src/TNL/Algorithms/Scan.h
rename to src/TNL/Algorithms/detail/Scan.h
index 348364802..5fee86b60 100644
--- a/src/TNL/Algorithms/Scan.h
+++ b/src/TNL/Algorithms/detail/Scan.h
@@ -15,74 +15,18 @@
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
+#include <TNL/Algorithms/detail/ScanType.h>
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
-/**
- * \brief Scan (or prefix sum) type - inclusive or exclusive.
- *
- * See \ref TNL::Algorithms::Scan.
- */
-enum class ScanType {
-   Exclusive,
-   Inclusive
-};
-
-/**
- * \brief Computes scan (or prefix sum) on a vector.
- *
- * [Scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum) operation turns a sequence
- * \f$a_1, \ldots, a_n\f$ into a sequence \f$s_1, \ldots, s_n\f$ defined as
- *
- * \f[
- * s_i = \sum_{j=1}^i a_i.
- * \f]
- * Exclusive scan (or prefix sum) is defined as
- *
- * \f[
- * \sigma_i = \sum_{j=1}^{i-1} a_i.
- * \f]
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Type parameter says if inclusive or exclusive is scan is to be computed.
- *
- * See \ref Scan< Devices::Host, Type > and \ref Scan< Devices::Cuda, Type >.
- */
-template< typename Device,
-          ScanType Type = ScanType::Inclusive >
+template< typename Device, ScanType Type >
 struct Scan;
 
 template< ScanType Type >
 struct Scan< Devices::Sequential, Type >
 {
-   /**
-    * \brief Computes scan (prefix sum) sequentially.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include ScanExample.out
-    */
    template< typename Vector,
              typename Reduction >
    static void
@@ -116,33 +60,6 @@ struct Scan< Devices::Sequential, Type >
 template< ScanType Type >
 struct Scan< Devices::Host, Type >
 {
-   /**
-    * \brief Computes scan (prefix sum) using OpenMP.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include ScanExample.out
-    */
    template< typename Vector,
              typename Reduction >
    static void
@@ -176,33 +93,6 @@ struct Scan< Devices::Host, Type >
 template< ScanType Type >
 struct Scan< Devices::Cuda, Type >
 {
-   /**
-    * \brief Computes scan (prefix sum) on GPU.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include ScanExample.out
-    */
    template< typename Vector,
              typename Reduction >
    static void
@@ -233,7 +123,8 @@ struct Scan< Devices::Cuda, Type >
                        const typename Vector::ValueType zero );
 };
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/Scan.hpp>
+#include "Scan.hpp"
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
similarity index 99%
rename from src/TNL/Algorithms/Scan.hpp
rename to src/TNL/Algorithms/detail/Scan.hpp
index 17de19a41..5e08bbf3b 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -13,16 +13,17 @@
 #pragma once
 
 #include "Scan.h"
-#include "reduce.h"
+#include "CudaScanKernel.h"
 
 #include <TNL/Assert.h>
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/StaticArray.h>
-#include <TNL/Algorithms/detail/CudaScanKernel.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 
 namespace TNL {
 namespace Algorithms {
+namespace detail {
 
 template< ScanType Type >
    template< typename Vector,
@@ -305,5 +306,6 @@ performSecondPhase( Vector& v,
 #endif
 }
 
+} // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/detail/ScanType.h b/src/TNL/Algorithms/detail/ScanType.h
new file mode 100644
index 000000000..b5721ad36
--- /dev/null
+++ b/src/TNL/Algorithms/detail/ScanType.h
@@ -0,0 +1,26 @@
+/***************************************************************************
+                          ScanType.h  -  description
+                             -------------------
+    begin                : May 9, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+namespace TNL {
+namespace Algorithms {
+namespace detail {
+
+enum class ScanType {
+   Exclusive,
+   Inclusive
+};
+
+} // namespace detail
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/distributedScan.h b/src/TNL/Algorithms/distributedScan.h
new file mode 100644
index 000000000..afabb8820
--- /dev/null
+++ b/src/TNL/Algorithms/distributedScan.h
@@ -0,0 +1,150 @@
+/***************************************************************************
+                          distributedScan.h  -  description
+                             -------------------
+    begin                : Jul 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>  // std::forward
+
+#include <TNL/Algorithms/detail/DistributedScan.h>
+#include <TNL/Functional.h>
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of a distributed array in-place.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam DistributedArray type of the distributed array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin index of the first element of the range to be scanned
+ * \param end index one past the last element to be scanned, i.e. the range is `[begin, end)`
+ * \param reduction functor implementing the reduction operation
+ * \param zero the identity (neutral) element of the reduction operation, i.e. the element
+ *             which does not change the result of the reduction (e.g. 0 for addition).
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+template< typename DistributedArray,
+          typename Reduction >
+void
+distributedInplaceInclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin,
+                                 typename DistributedArray::IndexType end,
+                                 Reduction&& reduction,
+                                 typename DistributedArray::ValueType zero )
+{
+   using Scan = detail::DistributedScan< detail::ScanType::Inclusive >;
+   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   array.startSynchronization();  // NOTE(review): presumably starts the update of ghost/overlap entries after the scan - confirm
+}
+
+/**
+ * \brief Overload of \ref distributedInplaceInclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity ("idempotent") value is taken as `reduction.template getIdempotent< typename DistributedArray::ValueType >()`.
+ * See \ref distributedInplaceInclusiveScan for the other parameters; `end` equal to 0 (the default) means `array.getSize()`.
+ */
+template< typename DistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedInplaceInclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin = 0,
+                                 typename DistributedArray::IndexType end = 0,
+                                 Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = array.getSize();
+   // `Reduction` is deduced as a reference type for lvalue functors, so query the object instead of `Reduction::`
+   const typename DistributedArray::ValueType zero = reduction.template getIdempotent< typename DistributedArray::ValueType >();
+   distributedInplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of a distributed array in-place.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam DistributedArray type of the distributed array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin index of the first element of the range to be scanned
+ * \param end index one past the last element to be scanned, i.e. the range is `[begin, end)`
+ * \param reduction functor implementing the reduction operation
+ * \param zero the identity (neutral) element of the reduction operation, i.e. the element
+ *             which does not change the result of the reduction (e.g. 0 for addition).
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+template< typename DistributedArray,
+          typename Reduction >
+void
+distributedInplaceExclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin,
+                                 typename DistributedArray::IndexType end,
+                                 Reduction&& reduction,
+                                 typename DistributedArray::ValueType zero )
+{
+   using Scan = detail::DistributedScan< detail::ScanType::Exclusive >;
+   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   array.startSynchronization();  // NOTE(review): presumably starts the update of ghost/overlap entries after the scan - confirm
+}
+
+/**
+ * \brief Overload of \ref distributedInplaceExclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity ("idempotent") value is taken as `reduction.template getIdempotent< typename DistributedArray::ValueType >()`.
+ * See \ref distributedInplaceExclusiveScan for the other parameters; `end` equal to 0 (the default) means `array.getSize()`.
+ */
+template< typename DistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedInplaceExclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin = 0,
+                                 typename DistributedArray::IndexType end = 0,
+                                 Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = array.getSize();
+   // `Reduction` is deduced as a reference type for lvalue functors, so query the object instead of `Reduction::`
+   const typename DistributedArray::ValueType zero = reduction.template getIdempotent< typename DistributedArray::ValueType >();
+   distributedInplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/scan.h b/src/TNL/Algorithms/scan.h
new file mode 100644
index 000000000..7836cf231
--- /dev/null
+++ b/src/TNL/Algorithms/scan.h
@@ -0,0 +1,164 @@
+/***************************************************************************
+                          scan.h  -  description
+                             -------------------
+    begin                : Jul 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>  // std::forward
+
+#include <TNL/Algorithms/detail/Scan.h>
+#include <TNL/Functional.h>
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of an array in-place.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam Array type of the array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin index of the first element of the range to be scanned
+ * \param end index one past the last element to be scanned, i.e. the range is `[begin, end)`
+ * \param reduction functor implementing the reduction operation
+ * \param zero the identity (neutral) element of the reduction operation, i.e. the element
+ *             which does not change the result of the reduction (e.g. 0 for addition).
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/inplaceInclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include inplaceInclusiveScanExample.out
+ */
+template< typename Array,
+          typename Reduction >
+void
+inplaceInclusiveScan( Array& array,
+                      typename Array::IndexType begin,
+                      typename Array::IndexType end,
+                      Reduction&& reduction,
+                      typename Array::ValueType zero )
+{
+   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Inclusive >;  // implementation is selected by the array's device type
+   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Overload of \ref inplaceInclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity ("idempotent") value is taken as `reduction.template getIdempotent< typename Array::ValueType >()`.
+ * See \ref inplaceInclusiveScan for the other parameters; `end` equal to 0 (the default) means `array.getSize()`.
+ */
+template< typename Array,
+          typename Reduction = TNL::Plus >
+void
+inplaceInclusiveScan( Array& array,
+                      typename Array::IndexType begin = 0,
+                      typename Array::IndexType end = 0,
+                      Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = array.getSize();
+   // `Reduction` is deduced as a reference type for lvalue functors, so query the object instead of `Reduction::`
+   const typename Array::ValueType zero = reduction.template getIdempotent< typename Array::ValueType >();
+   inplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of an array in-place.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam Array type of the array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin index of the first element of the range to be scanned
+ * \param end index one past the last element to be scanned, i.e. the range is `[begin, end)`
+ * \param reduction functor implementing the reduction operation
+ * \param zero the identity (neutral) element of the reduction operation, i.e. the element
+ *             which does not change the result of the reduction (e.g. 0 for addition).
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/inplaceExclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include inplaceExclusiveScanExample.out
+ */
+template< typename Array,
+          typename Reduction >
+void
+inplaceExclusiveScan( Array& array,
+                      typename Array::IndexType begin,
+                      typename Array::IndexType end,
+                      Reduction&& reduction,
+                      typename Array::ValueType zero )
+{
+   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Exclusive >;  // implementation is selected by the array's device type
+   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Overload of \ref inplaceExclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity ("idempotent") value is taken as `reduction.template getIdempotent< typename Array::ValueType >()`.
+ * See \ref inplaceExclusiveScan for the other parameters; `end` equal to 0 (the default) means `array.getSize()`.
+ */
+template< typename Array,
+          typename Reduction = TNL::Plus >
+void
+inplaceExclusiveScan( Array& array,
+                      typename Array::IndexType begin = 0,
+                      typename Array::IndexType end = 0,
+                      Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = array.getSize();
+   // `Reduction` is deduced as a reference type for lvalue functors, so query the object instead of `Reduction::`
+   const typename Array::ValueType zero = reduction.template getIdempotent< typename Array::ValueType >();
+   inplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h
index 8d737e3a9..beb9840b4 100644
--- a/src/TNL/Containers/DistributedVector.h
+++ b/src/TNL/Containers/DistributedVector.h
@@ -174,9 +174,6 @@ public:
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
    DistributedVector& operator/=( const Vector& vector );
-
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
 };
 
 // Enable expression templates for DistributedVector
diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp
index 044b747d9..72be20d0a 100644
--- a/src/TNL/Containers/DistributedVector.hpp
+++ b/src/TNL/Containers/DistributedVector.hpp
@@ -13,7 +13,6 @@
 #pragma once
 
 #include "DistributedVector.h"
-#include <TNL/Algorithms/DistributedScan.h>
 
 namespace TNL {
 namespace Containers {
@@ -250,17 +249,5 @@ operator/=( Scalar c )
    return *this;
 }
 
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type >
-void
-DistributedVector< Real, Device, Index, Allocator >::
-scan( IndexType begin, IndexType end )
-{
-   getView().template scan< Type >( begin, end );
-}
-
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h
index 4a46a47ce..4ceef1e7d 100644
--- a/src/TNL/Containers/DistributedVectorView.h
+++ b/src/TNL/Containers/DistributedVectorView.h
@@ -145,9 +145,6 @@ public:
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
    DistributedVectorView& operator/=( const Vector& vector );
-
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
 };
 
 // Enable expression templates for DistributedVector
diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp
index 2f9222f94..181270a35 100644
--- a/src/TNL/Containers/DistributedVectorView.hpp
+++ b/src/TNL/Containers/DistributedVectorView.hpp
@@ -13,7 +13,6 @@
 #pragma once
 
 #include "DistributedVectorView.h"
-#include <TNL/Algorithms/DistributedScan.h>
 
 namespace TNL {
 namespace Containers {
@@ -288,19 +287,5 @@ operator/=( Scalar c )
    return *this;
 }
 
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type >
-void
-DistributedVectorView< Real, Device, Index >::
-scan( IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
-   this->startSynchronization();
-}
-
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h
index 859e326d4..0bb991a67 100644
--- a/src/TNL/Containers/Vector.h
+++ b/src/TNL/Containers/Vector.h
@@ -256,86 +256,6 @@ public:
     */
    template< typename VectorExpression >
    Vector& operator/=( const VectorExpression& expression );
-
-   /**
-    * \brief Computes the scan (prefix sum) of the vector elements.
-    *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of the vector elements.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename FlagsArray >
-   void segmentedScan( FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the scan (prefix sum) of the vector expression.
-    *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression >
-   void scan( const VectorExpression& expression, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of a vector expression.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression,
-             typename FlagsArray >
-   void segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
 };
 
 // Enable expression templates for Vector
diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp
index b25ccbb5a..f204368de 100644
--- a/src/TNL/Containers/Vector.hpp
+++ b/src/TNL/Containers/Vector.hpp
@@ -153,61 +153,5 @@ operator/=( const VectorExpression& expression )
    return *this;
 }
 
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type >
-void
-Vector< Real, Device, Index, Allocator >::
-scan( IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::Scan< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type,
-             typename FlagsArray >
-void
-Vector< Real, Device, Index, Allocator >::
-segmentedScan( FlagsArray& flags, IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::SegmentedScan< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression >
-void
-Vector< Real, Device, Index, Allocator >::
-scan( const VectorExpression& expression, IndexType begin, IndexType end )
-{
-   throw Exceptions::NotImplementedError( "Scan (prefix sum) with vector expressions is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression,
-             typename FlagsArray >
-void
-Vector< Real, Device, Index, Allocator >::
-segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin, IndexType end )
-{
-   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) with vector expressions is not implemented." );
-}
-
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h
index 55983f43e..90f98028f 100644
--- a/src/TNL/Containers/VectorView.h
+++ b/src/TNL/Containers/VectorView.h
@@ -14,8 +14,6 @@
 
 #include <TNL/Containers/ArrayView.h>
 #include <TNL/Containers/Expressions/ExpressionTemplates.h>
-#include <TNL/Algorithms/Scan.h>
-#include <TNL/Algorithms/SegmentedScan.h>
 
 namespace TNL {
 namespace Containers {
@@ -215,86 +213,6 @@ public:
     */
    template< typename VectorExpression >
    VectorView& operator/=( const VectorExpression& expression );
-
-   /**
-    * \brief Computes the scan (prefix sum) of the vector elements.
-    *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of the vector elements.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename FlagsArray >
-   void segmentedScan( FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the scan (prefix sum) of the vector expression.
-    *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression >
-   void scan( const VectorExpression& expression, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of a vector expression.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression,
-             typename FlagsArray >
-   void segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
 };
 
 // Enable expression templates for VectorView
diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp
index 2c1cd02c8..034f362dc 100644
--- a/src/TNL/Containers/VectorView.hpp
+++ b/src/TNL/Containers/VectorView.hpp
@@ -102,57 +102,5 @@ operator/=( const VectorExpression& expression )
    return *this;
 }
 
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type >
-void
-VectorView< Real, Device, Index >::
-scan( IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::Scan< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type,
-             typename FlagsArray >
-void
-VectorView< Real, Device, Index >::
-segmentedScan( FlagsArray& flags, IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::SegmentedScan< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression >
-void
-VectorView< Real, Device, Index >::
-scan( const VectorExpression& expression, IndexType begin, IndexType end )
-{
-   throw Exceptions::NotImplementedError( "Scan (prefix sum) with vector expressions is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression,
-             typename FlagsArray >
-void
-VectorView< Real, Device, Index >::
-segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin, IndexType end )
-{
-   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) with vector expressions is not implemented." );
-}
-
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
index 36f28ba45..0353ded09 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
@@ -12,6 +12,7 @@
 
 #pragma once
 
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Containers/ByteArraySynchronizer.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Matrices/DenseMatrix.h>
@@ -383,7 +384,7 @@ public:
             // scan the rowPointers array to convert
             Containers::VectorView< GlobalIndexType, Devices::Host, GlobalIndexType > rowPointersView;
             rowPointersView.bind( recv_rowPointers );
-            rowPointersView.template scan< Algorithms::ScanType::Exclusive >();
+            Algorithms::inplaceExclusiveScan( rowPointersView );
          }
 
          // allocate column indices
diff --git a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
index 120cadf80..38cc5ccdf 100644
--- a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
+++ b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/MeshDetails/layers/EntityTags/Traits.h>
+#include <TNL/Algorithms/scan.h>
 
 namespace TNL {
 namespace Meshes {
@@ -238,7 +239,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
                      globalOffsets.getData(), 1,
                      mesh.getCommunicationGroup() );
    }
-   globalOffsets.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( globalOffsets );
 
    // 3. assign global indices to the local entities and a padding index to ghost entities
    //    (later we can check the padding index to know if an index was set or not)
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 92f58def4..628ca8dee 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -11,11 +11,11 @@ set( COMMON_TESTS
 )
 
 set( CPP_TESTS
-         ScanTest
+         scanTest
          SegmentedScanTest
 )
 set( CUDA_TESTS
-         ScanTestCuda
+         scanTestCuda
 )
 if( BUILD_CUDA )
    set( CUDA_TESTS  ${CUDA_TESTS} ${COMMON_TESTS} )
@@ -40,23 +40,23 @@ endif()
 
 
 if( ${BUILD_MPI} )
-   ADD_EXECUTABLE( DistributedScanTest DistributedScanTest.cpp )
-   TARGET_COMPILE_OPTIONS( DistributedScanTest PRIVATE ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( DistributedScanTest ${GTEST_BOTH_LIBRARIES} )
+   ADD_EXECUTABLE( distributedScanTest distributedScanTest.cpp )
+   TARGET_COMPILE_OPTIONS( distributedScanTest PRIVATE ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( distributedScanTest ${GTEST_BOTH_LIBRARIES} )
 
    if( BUILD_CUDA )
-      CUDA_ADD_EXECUTABLE( DistributedScanTestCuda DistributedScanTestCuda.cu
+      CUDA_ADD_EXECUTABLE( distributedScanTestCuda distributedScanTestCuda.cu
                            OPTIONS ${CXX_TESTS_FLAGS} )
-      TARGET_LINK_LIBRARIES( DistributedScanTestCuda ${GTEST_BOTH_LIBRARIES} )
+      TARGET_LINK_LIBRARIES( distributedScanTestCuda ${GTEST_BOTH_LIBRARIES} )
    endif()
 
-   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
-   ADD_TEST( NAME DistributedScanTest COMMAND "mpirun" ${mpi_test_parameters})
-   ADD_TEST( NAME DistributedScanTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
+   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
+   ADD_TEST( NAME distributedScanTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME distributedScanTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    if( BUILD_CUDA )
-      SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
-      ADD_TEST( NAME DistributedScanTestCuda COMMAND "mpirun" ${mpi_test_parameters})
-      ADD_TEST( NAME DistributedScanTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
+      SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
+      ADD_TEST( NAME distributedScanTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME distributedScanTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
    endif()
 endif()
diff --git a/src/UnitTests/Algorithms/DistributedScanTest.cpp b/src/UnitTests/Algorithms/DistributedScanTest.cpp
deleted file mode 100644
index 9c78e1ef9..000000000
--- a/src/UnitTests/Algorithms/DistributedScanTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "DistributedScanTest.h"
diff --git a/src/UnitTests/Algorithms/DistributedScanTestCuda.cu b/src/UnitTests/Algorithms/DistributedScanTestCuda.cu
deleted file mode 100644
index 9c78e1ef9..000000000
--- a/src/UnitTests/Algorithms/DistributedScanTestCuda.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "DistributedScanTest.h"
diff --git a/src/UnitTests/Algorithms/ScanTest.cpp b/src/UnitTests/Algorithms/ScanTest.cpp
deleted file mode 100644
index ac886b753..000000000
--- a/src/UnitTests/Algorithms/ScanTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "ScanTest.h"
diff --git a/src/UnitTests/Algorithms/ScanTestCuda.cu b/src/UnitTests/Algorithms/ScanTestCuda.cu
deleted file mode 100644
index ac886b753..000000000
--- a/src/UnitTests/Algorithms/ScanTestCuda.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "ScanTest.h"
diff --git a/src/UnitTests/Algorithms/distributedScanTest.cpp b/src/UnitTests/Algorithms/distributedScanTest.cpp
new file mode 100644
index 000000000..e1b60321b
--- /dev/null
+++ b/src/UnitTests/Algorithms/distributedScanTest.cpp
@@ -0,0 +1 @@
+#include "distributedScanTest.h"
diff --git a/src/UnitTests/Algorithms/DistributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
similarity index 70%
rename from src/UnitTests/Algorithms/DistributedScanTest.h
rename to src/UnitTests/Algorithms/distributedScanTest.h
index 5a15187bb..17f498229 100644
--- a/src/UnitTests/Algorithms/DistributedScanTest.h
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -8,7 +8,7 @@
 #include <TNL/Containers/DistributedArray.h>
 #include <TNL/Containers/DistributedArrayView.h>
 #include <TNL/Containers/Partitioner.h>
-#include <TNL/Algorithms/DistributedScan.h>
+#include <TNL/Algorithms/distributedScan.h>
 
 #define DISTRIBUTED_VECTOR
 #include "../Containers/VectorHelperFunctions.h"
@@ -98,21 +98,21 @@ TYPED_TEST( DistributedScanTest, inclusiveScan )
 
    setConstantSequence( v, 0 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
    setLinearSequence( v );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -120,21 +120,21 @@ TYPED_TEST( DistributedScanTest, inclusiveScan )
    // test views
    setConstantSequence( v, 0 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
    setLinearSequence( v );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -144,28 +144,28 @@ TYPED_TEST( DistributedScanTest, inclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
 
       setLinearSequence( v );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Inclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -173,29 +173,29 @@ TYPED_TEST( DistributedScanTest, inclusiveScan )
       // test views
       setConstantSequence( v, 0 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i + 1 );
 
       setLinearSequence( v );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Inclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
@@ -217,21 +217,21 @@ TYPED_TEST( DistributedScanTest, exclusiveScan )
 
    setConstantSequence( v, 0 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
    setLinearSequence( v );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -239,21 +239,21 @@ TYPED_TEST( DistributedScanTest, exclusiveScan )
    // test views
    setConstantSequence( v, 0 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
    setLinearSequence( v );
    v_host.setValue( -1 );
-   DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
       EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -263,28 +263,28 @@ TYPED_TEST( DistributedScanTest, exclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
 
       setLinearSequence( v );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Exclusive >::perform( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -292,29 +292,29 @@ TYPED_TEST( DistributedScanTest, exclusiveScan )
       // test views
       setConstantSequence( v, 0 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], 0 );
 
       setConstantSequence( v, 1 );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], i );
 
       setLinearSequence( v );
       v_host.setValue( -1 );
-      DistributedScan< ScanType::Exclusive >::perform( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
diff --git a/src/UnitTests/Algorithms/distributedScanTestCuda.cu b/src/UnitTests/Algorithms/distributedScanTestCuda.cu
new file mode 100644
index 000000000..e1b60321b
--- /dev/null
+++ b/src/UnitTests/Algorithms/distributedScanTestCuda.cu
@@ -0,0 +1 @@
+#include "distributedScanTest.h"
diff --git a/src/UnitTests/Algorithms/scanTest.cpp b/src/UnitTests/Algorithms/scanTest.cpp
new file mode 100644
index 000000000..17cdb1d80
--- /dev/null
+++ b/src/UnitTests/Algorithms/scanTest.cpp
@@ -0,0 +1 @@
+#include "scanTest.h"
diff --git a/src/UnitTests/Algorithms/ScanTest.h b/src/UnitTests/Algorithms/scanTest.h
similarity index 69%
rename from src/UnitTests/Algorithms/ScanTest.h
rename to src/UnitTests/Algorithms/scanTest.h
index 60a91a1e0..c0e8f7118 100644
--- a/src/UnitTests/Algorithms/ScanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -4,7 +4,7 @@
 
 #include <TNL/Arithmetics/Quad.h>
 #include <TNL/Containers/Array.h>
-#include <TNL/Algorithms/Scan.h>
+#include <TNL/Algorithms/scan.h>
 
 #include "../Containers/VectorHelperFunctions.h"
 
@@ -101,21 +101,21 @@ TYPED_TEST( ScanTest, inclusiveScan )
 
    setConstantSequence( v, 0 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
    setLinearSequence( v );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -123,21 +123,21 @@ TYPED_TEST( ScanTest, inclusiveScan )
    // test views
    setConstantSequence( v, 0 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v_view;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
    setLinearSequence( v );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -147,28 +147,28 @@ TYPED_TEST( ScanTest, inclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
       setConstantSequence( v, 1 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
       setLinearSequence( v );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
@@ -176,29 +176,29 @@ TYPED_TEST( ScanTest, inclusiveScan )
       // test views
       setConstantSequence( v, 0 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
       setConstantSequence( v, 1 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v_view;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
 
       setLinearSequence( v );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Inclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
+      inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
 
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
@@ -220,21 +220,21 @@ TYPED_TEST( ScanTest, exclusiveScan )
 
    setConstantSequence( v, 0 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
    setLinearSequence( v );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -242,21 +242,21 @@ TYPED_TEST( ScanTest, exclusiveScan )
    // test views
    setConstantSequence( v, 0 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
    setConstantSequence( v, 1 );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
    setLinearSequence( v );
    v_host = -1;
-   Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+   inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
    v_host = v;
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -266,28 +266,28 @@ TYPED_TEST( ScanTest, exclusiveScan )
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
 
       setConstantSequence( v, 0 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
       setConstantSequence( v, 1 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
       setLinearSequence( v );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
@@ -295,29 +295,29 @@ TYPED_TEST( ScanTest, exclusiveScan )
       // test views
       setConstantSequence( v, 0 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
 
       setConstantSequence( v, 1 );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
 
       setLinearSequence( v );
       v_host = -1;
-      Algorithms::Scan< DeviceType, ScanType::Exclusive >::perform( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
+      inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
+      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
       v_host = v;
       for( int i = 0; i < size; i++ )
          EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
 
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
+      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
 #endif
    }
 }
diff --git a/src/UnitTests/Algorithms/scanTestCuda.cu b/src/UnitTests/Algorithms/scanTestCuda.cu
new file mode 100644
index 000000000..17cdb1d80
--- /dev/null
+++ b/src/UnitTests/Algorithms/scanTestCuda.cu
@@ -0,0 +1 @@
+#include "scanTest.h"
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
index a0eddd162..f09f22287 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
@@ -403,8 +403,8 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe
       }
       vert_offsets.setElement( distributor.nproc, 0 );
       cell_offsets.setElement( distributor.nproc, 0 );
-      vert_offsets.template scan< Algorithms::ScanType::Exclusive >();
-      cell_offsets.template scan< Algorithms::ScanType::Exclusive >();
+      Algorithms::inplaceExclusiveScan( vert_offsets );
+      Algorithms::inplaceExclusiveScan( cell_offsets );
       EXPECT_EQ( vert_offsets[ distributor.rank ], mesh.template getGlobalIndices< 0 >()[ 0 ] );
       EXPECT_EQ( cell_offsets[ distributor.rank ], mesh.template getGlobalIndices< 2 >()[ 0 ] );
 
-- 
GitLab


From 42734a75ba565e68bb41acff0b182a21234d64ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 11 Jul 2021 14:01:32 +0200
Subject: [PATCH 19/52] Fixed header includes

---
 src/TNL/Algorithms/Segments/BiEllpackView.hpp     | 15 ++++++++-------
 .../Algorithms/Segments/ChunkedEllpackView.hpp    |  1 +
 src/TNL/Algorithms/detail/CudaScanKernel.h        |  5 -----
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index e861e8f76..0fae90d07 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -15,6 +15,7 @@
 #include <TNL/Algorithms/Segments/BiEllpackView.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
 //#include <TNL/Algorithms/Segments/details/BiEllpack.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -391,9 +392,9 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
                globalIdx += inStripIdx;
             for( IndexType j = 0; j < groupWidth && compute; j++ )
             {
-               //std::cerr << "    segmentIdx = " << segmentIdx << " groupIdx = " << groupIdx 
+               //std::cerr << "    segmentIdx = " << segmentIdx << " groupIdx = " << groupIdx
                //         << " groupWidth = " << groupWidth << " groupHeight = " << groupHeight
-               //          << " localIdx = " << localIdx << " globalIdx = " << globalIdx 
+               //          << " localIdx = " << localIdx << " globalIdx = " << globalIdx
                //          << " fetch = " << details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) << std::endl;
                aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
                if( Organization == RowMajorOrder )
@@ -496,7 +497,7 @@ printStructure( std::ostream& str ) const
       {
          const IndexType groupSize = groupPointers.getElement( groupIdx + 1 ) - groupPointers.getElement( groupIdx );
          const IndexType groupWidth = groupSize / groupHeight;
-         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight 
+         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight
              << " offset = " << groupPointers.getElement( groupIdx ) << std::endl;
          groupHeight /= 2;
       }
@@ -607,13 +608,13 @@ segmentsReductionKernel( IndexType gridIdx,
 
    /////
    // Fetch group pointers to shared memory
-   //bool b1 = ( threadIdx.x <= warpsCount * groupsInStrip ); 
+   //bool b1 = ( threadIdx.x <= warpsCount * groupsInStrip );
    //bool b2 = ( firstGroupIdx + threadIdx.x % groupsInStrip < this->groupPointers.getSize() );
    //printf( "tid = %d warpsCount * groupsInStrip = %d firstGroupIdx + threadIdx.x = %d this->groupPointers.getSize() = %d read = %d %d\n",
    //   threadIdx.x, warpsCount * groupsInStrip,
    //   firstGroupIdx + threadIdx.x,
    //   this->groupPointers.getSize(), ( int ) b1, ( int ) b2 );
-   if( threadIdx.x <= warpsCount * groupsInStrip && 
+   if( threadIdx.x <= warpsCount * groupsInStrip &&
       firstGroupInBlock + threadIdx.x < this->groupPointers.getSize() )
    {
       sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupInBlock + threadIdx.x ];
@@ -634,7 +635,7 @@ segmentsReductionKernel( IndexType gridIdx,
          IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ];
          TNL_ASSERT_LT( groupBegin, this->getStorageSize(), "" );
          //if( groupBegin >= this->getStorageSize() )
-         //   printf( "tid = %d sharedGroupOffset + group + 1 = %d strip = %d group = %d groupBegin = %d groupEnd = %d this->getStorageSize() = %d\n", 
+         //   printf( "tid = %d sharedGroupOffset + group + 1 = %d strip = %d group = %d groupBegin = %d groupEnd = %d this->getStorageSize() = %d\n",
          //      threadIdx.x, sharedGroupOffset + group + 1, strip, group, groupBegin, groupEnd, this->getStorageSize() );
          TNL_ASSERT_LT( groupEnd, this->getStorageSize(), "" );
          if( groupEnd - groupBegin > 0 )
@@ -675,7 +676,7 @@ segmentsReductionKernel( IndexType gridIdx,
             {
                temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
                //if( strip == 1 )
-               //   printf( "tid %d fetch %f temp %f \n", threadIdx.x, fetch( globalIdx, compute ), temp[ threadIdx.x ] );               
+               //   printf( "tid %d fetch %f temp %f \n", threadIdx.x, fetch( globalIdx, compute ), temp[ threadIdx.x ] );
                globalIdx += getWarpSize();
             }
             // TODO: reduction via templates
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 147b362d1..2b827307d 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -15,6 +15,7 @@
 #include <TNL/Algorithms/Segments/ChunkedEllpackView.h>
 #include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
 //#include <TNL/Algorithms/Segments/details/ChunkedEllpack.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 7338f56ff..81379ce59 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -10,8 +10,6 @@
 
 #pragma once
 
-#include <iostream>
-
 #include <TNL/Math.h>
 #include <TNL/Cuda/SharedMemory.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
@@ -246,7 +244,6 @@ struct CudaScanKernelLauncher
       const int elementsInBlock = 8 * blockSize;
       const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
       const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
-      //std::cerr << "numberOfgrids =  " << numberOfGrids << std::endl;
 
       // allocate array for the block results
       Containers::Array< Real, Devices::Cuda > blockResults;
@@ -260,7 +257,6 @@ struct CudaScanKernelLauncher
          Index currentSize = size - gridOffset;
          if( currentSize / elementsInBlock > maxGridSize() )
             currentSize = maxGridSize() * elementsInBlock;
-         //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;
 
          // setup block and grid size
          dim3 cudaBlockSize, cudaGridSize;
@@ -343,7 +339,6 @@ struct CudaScanKernelLauncher
          Index currentSize = size - gridOffset;
          if( currentSize / elementsInBlock > maxGridSize() )
             currentSize = maxGridSize() * elementsInBlock;
-         //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;
 
          // setup block and grid size
          dim3 cudaBlockSize, cudaGridSize;
-- 
GitLab


From c44b1140db5ba8f8f744d2e328e4622aeb3f9251 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 11 Jul 2021 18:30:35 +0200
Subject: [PATCH 20/52] Segments: renamed namespace details to detail

The latter is the conventional name for such a namespace, and its
contents are hidden from the generated documentation of the public
interface.
---
 src/TNL/Algorithms/Segments/BiEllpack.hpp     |  8 ++--
 src/TNL/Algorithms/Segments/BiEllpackView.h   |  4 +-
 src/TNL/Algorithms/Segments/BiEllpackView.hpp | 38 +++++++++----------
 src/TNL/Algorithms/Segments/CSR.hpp           |  8 ++--
 .../Algorithms/Segments/CSRAdaptiveKernel.h   |  8 ++--
 .../Algorithms/Segments/CSRAdaptiveKernel.hpp | 28 +++++++-------
 .../Segments/CSRAdaptiveKernelView.h          | 10 ++---
 .../Segments/CSRAdaptiveKernelView.hpp        | 24 ++++++------
 src/TNL/Algorithms/Segments/CSRHybridKernel.h |  2 +-
 .../Algorithms/Segments/CSRHybridKernel.hpp   |  4 +-
 src/TNL/Algorithms/Segments/CSRScalarKernel.h |  2 +-
 .../Algorithms/Segments/CSRScalarKernel.hpp   |  6 +--
 src/TNL/Algorithms/Segments/CSRVectorKernel.h |  2 +-
 .../Algorithms/Segments/CSRVectorKernel.hpp   |  4 +-
 src/TNL/Algorithms/Segments/CSRView.hpp       |  8 ++--
 src/TNL/Algorithms/Segments/ChunkedEllpack.h  |  2 +-
 .../Algorithms/Segments/ChunkedEllpack.hpp    |  4 +-
 .../Algorithms/Segments/ChunkedEllpackView.h  |  6 +--
 .../Segments/ChunkedEllpackView.hpp           | 34 ++++++++---------
 src/TNL/Algorithms/Segments/EllpackView.hpp   |  8 ++--
 .../Algorithms/Segments/SlicedEllpackView.hpp |  8 ++--
 .../Segments/{details => detail}/BiEllpack.h  |  8 ++--
 .../Segments/{details => detail}/CSR.h        |  4 +-
 .../CSRAdaptiveKernelBlockDescriptor.h        |  4 +-
 .../CSRAdaptiveKernelParameters.h             |  4 +-
 .../{details => detail}/CheckLambdas.h        |  4 +-
 .../{details => detail}/ChunkedEllpack.h      | 10 ++---
 .../{details => detail}/LambdaAdapter.h       |  4 +-
 28 files changed, 128 insertions(+), 128 deletions(-)
 rename src/TNL/Algorithms/Segments/{details => detail}/BiEllpack.h (98%)
 rename src/TNL/Algorithms/Segments/{details => detail}/CSR.h (98%)
 rename src/TNL/Algorithms/Segments/{details => detail}/CSRAdaptiveKernelBlockDescriptor.h (99%)
 rename src/TNL/Algorithms/Segments/{details => detail}/CSRAdaptiveKernelParameters.h (98%)
 rename src/TNL/Algorithms/Segments/{details => detail}/CheckLambdas.h (94%)
 rename src/TNL/Algorithms/Segments/{details => detail}/ChunkedEllpack.h (97%)
 rename src/TNL/Algorithms/Segments/{details => detail}/LambdaAdapter.h (96%)

diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index 9eb71956a..53a3eb905 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -301,7 +301,7 @@ verifyRowLengths( const SizesHolder& segmentsSizes )
       const IndexType begin = this->groupPointers.getElement( groupBegin ) * getWarpSize() + rowStripPerm * stripLength;
       IndexType elementPtr = begin;
       IndexType rowLength = 0;
-      const IndexType groupsCount = details::BiEllpack< Index, Device, Organization, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
+      const IndexType groupsCount = detail::BiEllpack< Index, Device, Organization, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
       for( IndexType group = 0; group < groupsCount; group++ )
       {
          std::cerr << "groupIdx = " << group << " groupLength = " << this->getGroupLength( strip, group ) << std::endl;
@@ -386,7 +386,7 @@ template< typename Device,
 auto BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::BiEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+   return detail::BiEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
       rowPermArray.getConstView(),
       groupPointers.getConstView(),
       segmentIdx );
@@ -422,7 +422,7 @@ template< typename Device,
 __cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType
 {
-      return details::BiEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
+      return detail::BiEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
          rowPermArray.getConstView(),
          groupPointers.getConstView(),
          segmentIdx,
@@ -588,7 +588,7 @@ template< typename Device,
 auto BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getStripLength( const IndexType stripIdx ) const -> IndexType
 {
-   return details::BiEllpack< Index, Device, Organization, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx );
+   return detail::BiEllpack< Index, Device, Organization, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 44629ea71..50f69e3aa 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -15,7 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/BiEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/BiEllpack.h>
+#include <TNL/Algorithms/Segments/detail/BiEllpack.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -207,7 +207,7 @@ class BiEllpackView
                                              Args_... args );
 
       template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
-      friend struct details::BiEllpackSegmentsReductionDispatcher;
+      friend struct detail::BiEllpackSegmentsReductionDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index 0fae90d07..03131a0de 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -13,8 +13,8 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/BiEllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-//#include <TNL/Algorithms/Segments/details/BiEllpack.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+//#include <TNL/Algorithms/Segments/detail/BiEllpack.h>
 #include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
@@ -158,19 +158,19 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSize(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSize(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #endif
    }
    else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
@@ -206,13 +206,13 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
          rowPermArray,
          groupPointers,
          segmentIdx,
          localIdx );
 #else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndex(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndex(
          rowPermArray,
          groupPointers,
          segmentIdx,
@@ -220,7 +220,7 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
 #endif
    }
    else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
          rowPermArray,
          groupPointers,
          segmentIdx,
@@ -239,19 +239,19 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentView(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentView(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #endif
    }
    else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
@@ -272,7 +272,7 @@ forElements( IndexType first, IndexType last, Function&& f ) const
       const IndexType strip = segmentIdx / getWarpSize();
       const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
       const IndexType rowStripPerm = segmentsPermutationView[ segmentIdx ] - strip * getWarpSize();
-      const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
+      const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
       IndexType groupHeight = getWarpSize();
       //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
       bool compute( true );
@@ -357,7 +357,7 @@ void
 BiEllpackView< Device, Index, Organization, WarpSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( this->getStorageSize() == 0 )
       return;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -366,7 +366,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          const IndexType stripIdx = segmentIdx / getWarpSize();
          const IndexType groupIdx = stripIdx * ( getLogWarpSize() + 1 );
          const IndexType inStripIdx = rowPermArray[ segmentIdx ] - stripIdx * getWarpSize();
-         const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCount( rowPermArray, segmentIdx );
+         const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCount( rowPermArray, segmentIdx );
          IndexType globalIdx = groupPointers[ groupIdx ];
          IndexType groupHeight = getWarpSize();
          IndexType localIdx( 0 );
@@ -380,7 +380,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          //          << std::endl;
          for( IndexType group = 0; group < groupsCount && compute; group++ )
          {
-            const IndexType groupSize = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group );
+            const IndexType groupSize = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group );
             IndexType groupWidth = groupSize / groupHeight;
             const IndexType globalIdxBack = globalIdx;
             //std::cerr << "  groupSize = " << groupSize
@@ -395,8 +395,8 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
                //std::cerr << "    segmentIdx = " << segmentIdx << " groupIdx = " << groupIdx
                //         << " groupWidth = " << groupWidth << " groupHeight = " << groupHeight
                //          << " localIdx = " << localIdx << " globalIdx = " << globalIdx
-               //          << " fetch = " << details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) << std::endl;
-               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               //          << " fetch = " << detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) << std::endl;
+               aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
                if( Organization == RowMajorOrder )
                   globalIdx ++;
                else
@@ -425,7 +425,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          dim3 cudaGridSize = Cuda::getMaxGridSize();
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+         detail::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
          cudaThreadSynchronize();
@@ -535,7 +535,7 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
    const IndexType strip = segmentIdx / getWarpSize();
    const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
    const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
-   const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+   const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
    IndexType groupHeight = getWarpSize();
    bool compute( true );
    IndexType localIdx( 0 );
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 823393c2f..44f9aa799 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -13,7 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSR.h>
-#include <TNL/Algorithms/Segments/details/CSR.h>
+#include <TNL/Algorithms/Segments/detail/CSR.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -91,7 +91,7 @@ void
 CSR< Device, Index, Kernel, IndexAllocator >::
 setSegmentsSizes( const SizesHolder& sizes )
 {
-   details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
+   detail::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
    this->kernel.init( this->offsets );
 }
 
@@ -148,7 +148,7 @@ template< typename Device,
 __cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
+   return detail::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
 
 template< typename Device,
@@ -168,7 +168,7 @@ template< typename Device,
 __cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getStorageSize() const -> IndexType
 {
-   return details::CSR< Device, Index >::getStorageSize( this->offsets );
+   return detail::CSR< Device, Index >::getStorageSize( this->offsets );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 58710a883..640120f86 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -14,10 +14,10 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -65,7 +65,7 @@ struct CSRAdaptiveKernel
 
    static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };
 
-   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
+   static int getSizeValueLog( const int& i ) { return detail::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
 
    static TNL::String getKernelType();
 
@@ -98,7 +98,7 @@ struct CSRAdaptiveKernel
       Index findLimit( const Index start,
                      const Offsets& offsets,
                      const Index size,
-                     details::Type &type,
+                     detail::Type &type,
                      size_t &sum );
 
       template< int SizeOfValue,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
index d0217b57b..a510ac395 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
@@ -14,9 +14,9 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -121,7 +121,7 @@ CSRAdaptiveKernel< Index, Device >::
 findLimit( const Index start,
            const Offsets& offsets,
            const Index size,
-           details::Type &type,
+           detail::Type &type,
            size_t &sum )
 {
    sum = 0;
@@ -129,24 +129,24 @@ findLimit( const Index start,
    {
       Index elements = offsets[ current + 1 ] - offsets[ current ];
       sum += elements;
-      if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() )
+      if( sum > detail::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() )
       {
          if( current - start > 0 ) // extra row
          {
-            type = details::Type::STREAM;
+            type = detail::Type::STREAM;
             return current;
          }
          else
          {                  // one long row
-            if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT )
-               type = details::Type::VECTOR;
+            if( sum <= 2 * detail::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT )
+               type = detail::Type::VECTOR;
             else
-               type = details::Type::LONG;
+               type = detail::Type::LONG;
             return current + 1;
          }
       }
    }
-   type = details::Type::STREAM;
+   type = detail::Type::STREAM;
    return size - 1; // return last row pointer
 }
 
@@ -165,22 +165,22 @@ initValueSize( const Offsets& offsets )
    size_t sum;
 
    // Fill blocks
-   std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
+   std::vector< detail::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
    inBlocks.reserve( rows );
 
    while( nextStart != rows - 1 )
    {
-      details::Type type;
+      detail::Type type;
       nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum );
-      if( type == details::Type::LONG )
+      if( type == detail::Type::LONG )
       {
          const Index blocksCount = inBlocks.size();
-         const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize();
+         const Index warpsPerCudaBlock = detail::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize();
          Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
          if( warpsLeft == 0 )
             warpsLeft = warpsPerCudaBlock;
          for( Index index = 0; index < warpsLeft; index++ )
-            inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
+            inBlocks.emplace_back( start, detail::Type::LONG, index, warpsLeft );
       }
       else
       {
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
index b81d36027..9de407051 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
@@ -11,8 +11,8 @@
 #pragma once
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -26,12 +26,12 @@ struct CSRAdaptiveKernelView
    using DeviceType = Device;
    using ViewType = CSRAdaptiveKernelView< Index, Device >;
    using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
-   using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
+   using BlocksType = TNL::Containers::Vector< detail::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
    using BlocksView = typename BlocksType::ViewType;
 
-   static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog;
+   static constexpr int MaxValueSizeLog = detail::CSRAdaptiveKernelParameters<>::MaxValueSizeLog;
 
-   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
+   static int getSizeValueLog( const int& i ) { return detail::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
 
    CSRAdaptiveKernelView() = default;
 
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 979a54524..4f1560857 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -14,11 +14,11 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -46,11 +46,11 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
-   using BlockType = details::CSRAdaptiveKernelBlockDescriptor< Index >;
-   constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   using BlockType = detail::CSRAdaptiveKernelBlockDescriptor< Index >;
+   constexpr int CudaBlockSize = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
    constexpr int WarpSize = Cuda::getWarpSize();
-   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
-   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
+   constexpr int WarpsCount = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
+   constexpr size_t StreamedSharedElementsPerWarp  = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
 
    __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
    __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
@@ -74,7 +74,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    const Index firstSegmentIdx = block.getFirstSegment();
    const Index begin = offsets[ firstSegmentIdx ];
 
-   if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
+   if( block.getType() == detail::Type::STREAM ) // Stream kernel - many short segments per warp
    {
       const Index warpIdx = threadIdx.x / 32;
       const Index end = begin + block.getSize();
@@ -94,7 +94,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          keep( i, result );
       }
    }
-   else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   else if( block.getType() == detail::Type::VECTOR ) // Vector kernel - one segment per warp
    {
       const Index end = begin + block.getSize();
       const Index segmentIdx = block.getFirstSegment();
@@ -181,7 +181,7 @@ template< typename Index,
           typename Reduction,
           typename ResultKeeper,
           bool DispatchScalarCSR =
-            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
+            detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
             std::is_same< Device, Devices::Host >::value >
 struct CSRAdaptiveKernelSegmentsReductionDispatcher;
 
@@ -237,7 +237,7 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
 
       Index blocksCount;
 
-      const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+      const Index threads = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
       constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();
 
       // Fill blocks
@@ -333,7 +333,7 @@ segmentsReduction( const OffsetsView& offsets,
 {
    int valueSizeLog = getSizeValueLog( sizeof( Real ) );
 
-   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
+   if( detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
    {
       TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
          segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.h b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
index 9a8109c97..d3e48be1e 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
index b4cc24a73..90505358e 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRHybridKernel.h>
 
 namespace TNL {
@@ -57,7 +57,7 @@ void segmentsReductionCSRHybridKernel(
     bool compute( true );
     for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
     {
-      aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+      aux = reduce( aux, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
       localIdx += TNL::Cuda::getWarpSize();
     }
 
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.h b/src/TNL/Algorithms/Segments/CSRScalarKernel.h
index 8a56d75d1..c76708319 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRScalarKernel.h
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
index 15f696679..dd05fee20 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
@@ -15,7 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -91,7 +91,7 @@ segmentsReduction( const OffsetsView& offsets,
         IndexType localIdx( 0 );
         bool compute( true );
         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
         keeper( segmentIdx, aux );
     };
 
@@ -109,7 +109,7 @@ segmentsReduction( const OffsetsView& offsets,
             IndexType localIdx( 0 );
             bool compute( true );
             for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-                aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+                aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             keeper( segmentIdx, aux );
         }*/
     }
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.h b/src/TNL/Algorithms/Segments/CSRVectorKernel.h
index 3163abb60..074f15c5a 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRVectorKernel.h
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
index 2caf272c1..847d1c355 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRVectorKernel.h>
 
 namespace TNL {
@@ -58,7 +58,7 @@ void segmentsReductionCSRKernelVector(
     for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
     {
         TNL_ASSERT_LT( globalIdx, endIdx, "" );
-        aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+        aux = reduce( aux, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
         localIdx += TNL::Cuda::getWarpSize();
     }
 
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index bb40dc9f6..8c9f1e789 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -13,8 +13,8 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSRView.h>
-#include <TNL/Algorithms/Segments/details/CSR.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/CSR.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -131,7 +131,7 @@ template< typename Device,
 __cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
+   return detail::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
 
 template< typename Device,
@@ -149,7 +149,7 @@ template< typename Device,
 __cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getStorageSize() const -> IndexType
 {
-   return details::CSR< Device, Index >::getStorageSize( this->offsets );
+   return detail::CSR< Device, Index >::getStorageSize( this->offsets );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index b6bdd5bf1..5abb93b5a 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -36,7 +36,7 @@ class ChunkedEllpack
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, Organization >;
       using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< IndexType >, Organization >;
       using SegmentViewType = typename ViewType::SegmentViewType;
-      using ChunkedEllpackSliceInfoType = typename ViewType::ChunkedEllpackSliceInfoType; // details::ChunkedEllpackSliceInfo< IndexType >;
+      using ChunkedEllpackSliceInfoType = typename ViewType::ChunkedEllpackSliceInfoType; // detail::ChunkedEllpackSliceInfo< IndexType >;
       //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoAllocator = typename ViewType::ChunkedEllpackSliceInfoAllocator; // typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = typename ViewType::ChunkedEllpackSliceInfoContainer; // Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index fd3df2053..b4f60047b 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -336,7 +336,7 @@ template< typename Device,
 auto ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+   return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
       rowToSliceMapping.getView(),
       slices.getView(),
       rowToChunkMapping.getView(),
@@ -370,7 +370,7 @@ template< typename Device,
 __cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index 196c0764e..f7211c216 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -16,7 +16,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/detail/ChunkedEllpack.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -39,7 +39,7 @@ class ChunkedEllpackView
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, Organization >;
       using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index >, Organization >;
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, Organization >;
-      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
+      using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
@@ -230,7 +230,7 @@ class ChunkedEllpackView
                                                   Args_... args );
 
       template< typename Index_, typename Fetch_, bool B_ >
-      friend struct details::ChunkedEllpackSegmentsReductionDispatcher;
+      friend struct detail::ChunkedEllpackSegmentsReductionDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 2b827307d..26e8fd0f7 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -13,8 +13,8 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-//#include <TNL/Algorithms/Segments/details/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+//#include <TNL/Algorithms/Segments/detail/ChunkedEllpack.h>
 #include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
@@ -184,7 +184,7 @@ __cuda_callable__ auto ChunkedEllpackView< Device, Index, Organization >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -192,13 +192,13 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
          segmentIdx );
 #else
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -232,7 +232,7 @@ __cuda_callable__ auto ChunkedEllpackView< Device, Index, Organization >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -242,7 +242,7 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -250,7 +250,7 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
          segmentIdx,
          localIdx );
 #else
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -270,7 +270,7 @@ ChunkedEllpackView< Device, Index, Organization >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -279,14 +279,14 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
          chunksInSlice,
          segmentIdx );
 #else
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentView(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentView(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -398,7 +398,7 @@ void
 ChunkedEllpackView< Device, Index, Organization >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
       //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
@@ -426,7 +426,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
             IndexType end = begin + segmentSize;
             IndexType localIdx( 0 );
             for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ )
-               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          }
          else
          {
@@ -436,7 +436,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
                IndexType end = begin + chunksInSlice * chunkSize;
                IndexType localIdx( 0 );
                for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += chunksInSlice )
-                  aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+                  aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             }
          }
          keeper( segmentIdx, aux );
@@ -456,7 +456,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+         detail::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
       }
@@ -567,7 +567,7 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
       return;
 
    RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
+   __shared__ detail::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
    if( threadIdx.x == 0 )
       sliceInfo = this->slices[ sliceIdx ];
    chunksResults[ threadIdx.x ] = zero;
@@ -645,7 +645,7 @@ segmentsReductionKernel( IndexType gridIdx,
       return;
 
    RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
+   __shared__ detail::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
 
    if( threadIdx.x == 0 )
       sliceInfo = this->slices[ sliceIdx ];
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 724774b53..6215f4ef9 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -13,7 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/EllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -276,7 +276,7 @@ void EllpackView< Device, Index, Organization, Alignment >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( Organization == RowMajorOrder )
    {
       const IndexType segmentSize = this->segmentSize;
@@ -287,7 +287,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j++  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -303,7 +303,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j += alignedSize  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 42fdae7ea..94bebca13 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -13,7 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/SlicedEllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 #include "SlicedEllpackView.h"
 
@@ -331,7 +331,7 @@ void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
@@ -347,7 +347,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -364,7 +364,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
diff --git a/src/TNL/Algorithms/Segments/details/BiEllpack.h b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
similarity index 98%
rename from src/TNL/Algorithms/Segments/details/BiEllpack.h
rename to src/TNL/Algorithms/Segments/detail/BiEllpack.h
index 29551eb1d..a45e16d77 100644
--- a/src/TNL/Algorithms/Segments/details/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
@@ -13,12 +13,12 @@
 #include <type_traits>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/BiEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/CheckLambdas.h>
+#include <TNL/Algorithms/Segments/detail/CheckLambdas.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Index,
           typename Device,
@@ -292,7 +292,7 @@ template< typename Index,
           typename Fetch,
           int BlockDim = 256,
           int WarpSize = 32,
-          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct BiEllpackSegmentsReductionDispatcher{};
 
 template< typename Index, typename Fetch, int BlockDim, int WarpSize >
@@ -364,7 +364,7 @@ void BiEllpackSegmentsReductionKernel( View biEllpack,
 }
 #endif
 
-         } //namespace details
+         } //namespace detail
       } //namespace Segments
    } //namespace Algorithms
 } //namepsace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CSR.h b/src/TNL/Algorithms/Segments/detail/CSR.h
similarity index 98%
rename from src/TNL/Algorithms/Segments/details/CSR.h
rename to src/TNL/Algorithms/Segments/detail/CSR.h
index 193758f70..e43a97b67 100644
--- a/src/TNL/Algorithms/Segments/details/CSR.h
+++ b/src/TNL/Algorithms/Segments/detail/CSR.h
@@ -15,7 +15,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Device,
           typename Index >
@@ -110,7 +110,7 @@ class CSR
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 };
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h
similarity index 99%
rename from src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
rename to src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h
index d2be89664..83faa105d 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h
@@ -13,7 +13,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 enum class Type {
    /* LONG = 0!!! Non zero value rewrites index[1] */
@@ -245,7 +245,7 @@ std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescri
    block.print( str );
    return str;
 }
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
similarity index 98%
rename from src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
rename to src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
index 843f2f7d5..f11668c2d 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
@@ -13,7 +13,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 // This can be used for tunning the number of CUDA threads per block depending on the size of Value
 // TODO: Perform some tests
@@ -106,7 +106,7 @@ getSizeValueLogConstexpr( const int i )
    return 6;
 };
 
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CheckLambdas.h b/src/TNL/Algorithms/Segments/detail/CheckLambdas.h
similarity index 94%
rename from src/TNL/Algorithms/Segments/details/CheckLambdas.h
rename to src/TNL/Algorithms/Segments/detail/CheckLambdas.h
index a9b6d672b..11944f948 100644
--- a/src/TNL/Algorithms/Segments/details/CheckLambdas.h
+++ b/src/TNL/Algorithms/Segments/detail/CheckLambdas.h
@@ -14,7 +14,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Index,
           typename Lambda >
@@ -34,7 +34,7 @@ class CheckFetchLambda
       static constexpr bool hasAllParameters() { return value; };
 };
 
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
similarity index 97%
rename from src/TNL/Algorithms/Segments/details/ChunkedEllpack.h
rename to src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
index 41e4ca415..5f47b0caf 100644
--- a/src/TNL/Algorithms/Segments/details/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
@@ -13,12 +13,12 @@
 #include <type_traits>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/CheckLambdas.h>
+#include <TNL/Algorithms/Segments/detail/CheckLambdas.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 /***
  * In the ChunkedEllpack, the segments are split into slices. This is done
@@ -65,7 +65,7 @@ class ChunkedEllpack
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
       using OffsetsHolderView = typename OffsetsHolder::ViewType;
       using SegmentsSizes = OffsetsHolder;
-      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
+      using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
@@ -233,7 +233,7 @@ class ChunkedEllpack
 #ifdef HAVE_CUDA
 template< typename Index,
           typename Fetch,
-          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct ChunkedEllpackSegmentsReductionDispatcher{};
 
 template< typename Index, typename Fetch >
@@ -304,7 +304,7 @@ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
 }
 #endif
 
-         } //namespace details
+         } //namespace detail
       } //namespace Segments
    } //namespace Algorithms
 } //namepsace TNL
diff --git a/src/TNL/Algorithms/Segments/details/LambdaAdapter.h b/src/TNL/Algorithms/Segments/detail/LambdaAdapter.h
similarity index 96%
rename from src/TNL/Algorithms/Segments/details/LambdaAdapter.h
rename to src/TNL/Algorithms/Segments/detail/LambdaAdapter.h
index e4d8871c5..a46acba8f 100644
--- a/src/TNL/Algorithms/Segments/details/LambdaAdapter.h
+++ b/src/TNL/Algorithms/Segments/detail/LambdaAdapter.h
@@ -15,7 +15,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Index,
           typename Lambda,
@@ -50,7 +50,7 @@ struct FetchLambdaAdapter< Index, Lambda, false >
    }
 };
 
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
-- 
GitLab


From 4467323ab17fbd7ef0f31d4017d60296d2875840 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 14 Jul 2021 07:58:30 +0200
Subject: [PATCH 21/52] Fixed bug in the second phase of CUDA scan
 implementation

---
 src/TNL/Algorithms/detail/CudaScanKernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 81379ce59..8fb89beb2 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -348,7 +348,7 @@ struct CudaScanKernelLauncher
          // run the kernel
          cudaSecondPhaseBlockScan<<< cudaGridSize, cudaBlockSize >>>
             ( reduction,
-              size,
+              currentSize,
               elementsInBlock,
               gridIdx,
               (Index) maxGridSize(),
-- 
GitLab


From 4f1dc3af048c71ace3bc73dfdb6875b9cc62fe52 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 13 Jul 2021 10:08:14 +0200
Subject: [PATCH 22/52] Refactored and extended tests for scan and distributed
 scan

---
 src/TNL/Algorithms/detail/CudaScanKernel.h    |   1 +
 src/TNL/Algorithms/detail/Scan.hpp            |  24 +
 src/TNL/Algorithms/scan.h                     |   4 +
 .../Algorithms/distributedScanTest.h          | 622 +++++++++++-------
 src/UnitTests/Algorithms/scanTest.h           | 606 ++++++++++-------
 5 files changed, 771 insertions(+), 486 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 8fb89beb2..fe3b26549 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -373,6 +373,7 @@ struct CudaScanKernelLauncher
    static void resetMaxGridSize()
    {
       maxGridSize() = Cuda::getMaxGridSize();
+      gridsCount() = -1;
    }
 
    static int& gridsCount()
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
index 5e08bbf3b..ee5a4bc1f 100644
--- a/src/TNL/Algorithms/detail/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -107,6 +107,9 @@ perform( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   if( end <= begin )
+      return;
+
    const IndexType size = end - begin;
    const int max_threads = Devices::Host::getMaxThreadsCount();
    const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
@@ -157,6 +160,12 @@ performFirstPhase( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   if( end <= begin ) {
+      Containers::Array< typename Vector::ValueType, Devices::Sequential > block_results( 1 );
+      block_results.setValue( zero );
+      return block_results;
+   }
+
    const IndexType size = end - begin;
    const int max_threads = Devices::Host::getMaxThreadsCount();
    const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
@@ -204,6 +213,9 @@ performSecondPhase( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   if( end <= begin )
+      return;
+
    const IndexType size = end - begin;
    const int max_threads = Devices::Host::getMaxThreadsCount();
    const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
@@ -241,6 +253,9 @@ perform( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   if( end <= begin )
+      return;
+
    detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::perform(
       end - begin,
       &v.getData()[ begin ],  // input
@@ -267,6 +282,12 @@ performFirstPhase( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   if( end <= begin ) {
+      Containers::Array< typename Vector::ValueType, Devices::Cuda > block_results( 1 );
+      block_results.setValue( zero );
+      return block_results;
+   }
+
    return detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performFirstPhase(
       end - begin,
       &v.getData()[ begin ],  // input
@@ -295,6 +316,9 @@ performSecondPhase( Vector& v,
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
 
+   if( end <= begin )
+      return;
+
    detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performSecondPhase(
       end - begin,
       &v.getData()[ begin ],  // output
diff --git a/src/TNL/Algorithms/scan.h b/src/TNL/Algorithms/scan.h
index 7836cf231..0a6f11f5b 100644
--- a/src/TNL/Algorithms/scan.h
+++ b/src/TNL/Algorithms/scan.h
@@ -64,6 +64,8 @@ inplaceInclusiveScan( Array& array,
                       Reduction&& reduction,
                       typename Array::ValueType zero )
 {
+   TNL_ASSERT_EQ( reduction( zero, zero ), zero,
+                  "zero is not an idempotent value of the reduction operation" );
    using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Inclusive >;
    Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
 }
@@ -134,6 +136,8 @@ inplaceExclusiveScan( Array& array,
                       Reduction&& reduction,
                       typename Array::ValueType zero )
 {
+   TNL_ASSERT_EQ( reduction( zero, zero ), zero,
+                  "zero is not an idempotent value of the reduction operation" );
    using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Exclusive >;
    Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
 }
diff --git a/src/UnitTests/Algorithms/distributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
index 17f498229..ab857001a 100644
--- a/src/UnitTests/Algorithms/distributedScanTest.h
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -1,12 +1,10 @@
 #pragma once
 
 #ifdef HAVE_GTEST
-#include <limits>
-
 #include <gtest/gtest.h>
 
 #include <TNL/Containers/DistributedArray.h>
-#include <TNL/Containers/DistributedArrayView.h>
+#include <TNL/Containers/DistributedVectorView.h>
 #include <TNL/Containers/Partitioner.h>
 #include <TNL/Algorithms/distributedScan.h>
 
@@ -16,6 +14,7 @@
 using namespace TNL;
 using namespace TNL::Containers;
 using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::detail;
 using namespace TNL::MPI;
 
 /*
@@ -34,49 +33,91 @@ protected:
    using DeviceType = typename DistributedArray::DeviceType;
    using IndexType = typename DistributedArray::IndexType;
    using DistributedArrayType = DistributedArray;
-   using VectorViewType = typename DistributedArrayType::LocalViewType;
    using DistributedArrayView = Containers::DistributedArrayView< ValueType, DeviceType, IndexType >;
+   using DistributedVectorView = Containers::DistributedVectorView< ValueType, DeviceType, IndexType >;
    using HostDistributedArrayType = typename DistributedArrayType::template Self< ValueType, Devices::Sequential >;
+   using LocalRangeType = typename DistributedArray::LocalRangeType;
+   using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
+   using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >;
 
    const MPI_Comm group = AllGroup();
 
-   DistributedArrayType v;
-   DistributedArrayView v_view;
-   HostDistributedArrayType v_host;
+   DistributedArrayType a, b, c;
+   DistributedArrayView a_view, b_view, c_view;
+   DistributedVectorView av_view, bv_view, cv_view;
+   HostDistributedArrayType array_host, input_host, expected_host;
 
    const int rank = GetRank(group);
    const int nproc = GetSize(group);
 
    // should be small enough to have fast tests, but large enough to test
    // scan with multiple CUDA grids
-   const int globalSize = 10000 * nproc;
+   // also should be a prime number to cause non-uniform distribution of the work
+   const int globalSize = 9377 * nproc;
+
+   LocalRangeType localRange;
 
    // some arbitrary value (but must be 0 if not distributed)
    const int ghosts = (nproc > 1) ? 4 : 0;
 
    DistributedScanTest()
    {
-      using LocalRangeType = typename DistributedArray::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
-      v.setDistribution( localRange, ghosts, globalSize, group );
-
-      using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
-      using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >;
-      v.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
-      v_view.setSynchronizer( v.getSynchronizer() );
-      v_host.setSynchronizer( std::make_shared<HostSynchronizer>( localRange, ghosts / 2, group ) );
-
-      v_view.bind( v );
-      setConstantSequence( v, 1 );
+      resetWorkingArrays();
+      input_host = a;
+      input_host.setSynchronizer( std::make_shared<HostSynchronizer>( a.getLocalRange(), ghosts / 2, group ) );
+      expected_host = input_host;
+   }
+
+   void resetWorkingArrays()
+   {
+      localRange = Partitioner< IndexType >::splitRange( globalSize, group );
+      a.setDistribution( localRange, ghosts, globalSize, group );
+      a.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
+
+      a.setValue( -1 );
+      c = b = a;
+      a_view.bind( a );
+      b_view.bind( b );
+      c_view.bind( c );
+      av_view.bind( a );
+      bv_view.bind( b );
+      cv_view.bind( c );
+
+      // make sure that we perform tests with multiple CUDA grids
+#ifdef HAVE_CUDA
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
+      }
+#endif
+   }
+
+   template< Algorithms::detail::ScanType ScanType >
+   void checkResult( const DistributedArrayType& array, bool check_cuda_grids = true )
+   {
+#ifdef HAVE_CUDA
+      // skip the check for too small arrays
+      if( check_cuda_grids && array.getLocalRange().getSize() > 256 )
+         EXPECT_GT( ( CudaScanKernelLauncher< ScanType, ValueType, IndexType >::gridsCount() ), 1 );
+#endif
+
+      array_host = array;
+
+      for( int i = a.getLocalRange().getBegin(); i < a.getLocalRange().getEnd(); i++ )
+         EXPECT_EQ( array_host[ i ], expected_host[ i ] ) << "arrays differ at index i = " << i;
    }
 };
 
 // types for which DistributedScanTest is instantiated
 using DistributedArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
    DistributedArray< double, Devices::Sequential, int >,
    DistributedArray< double, Devices::Host, int >
+#endif
 #ifdef HAVE_CUDA
-   ,
    DistributedArray< double, Devices::Cuda, int >
 #endif
 >;
@@ -85,238 +126,325 @@ TYPED_TEST_SUITE( DistributedScanTest, DistributedArrayTypes );
 
 // TODO: test that horizontal operations are computed for ghost values without synchronization
 
-TYPED_TEST( DistributedScanTest, inclusiveScan )
+TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_zero_array )
 {
-   using ValueType = typename TestFixture::DistributedArrayType::ValueType;
-   using DeviceType = typename TestFixture::DistributedArrayType::DeviceType;
-   using IndexType = typename TestFixture::DistributedArrayType::IndexType;
-
-   auto& v = this->v;
-   auto& v_view = this->v_view;
-   auto& v_host = this->v_host;
-   const auto localRange = v.getLocalRange();
-
-   setConstantSequence( v, 0 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 );
-
-      setLinearSequence( v );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceInclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 );
-
-      setLinearSequence( v );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceInclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
-#endif
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // inclusive scan of 0,1,2,... is i*(i+1)/2; evaluated in ValueType because a 32-bit int product overflows for i > 46340
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = ( (ValueType) i * (i + 1) ) / 2;
    }
+
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
-TYPED_TEST( DistributedScanTest, exclusiveScan )
+TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_zero_array )
 {
-   using ValueType = typename TestFixture::DistributedArrayType::ValueType;
-   using DeviceType = typename TestFixture::DistributedArrayType::DeviceType;
-   using IndexType = typename TestFixture::DistributedArrayType::IndexType;
-
-   auto& v = this->v;
-   auto& v_view = this->v_view;
-   auto& v_host = this->v_host;
-   const auto localRange = v.getLocalRange();
-
-   // FIXME: tests should work in all cases
-   if( std::is_same< ValueType, float >::value )
-      return;
-
-   setConstantSequence( v, 0 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host.setValue( -1 );
-   Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i );
-
-      setLinearSequence( v );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceExclusiveScan( v, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i );
-
-      setLinearSequence( v );
-      v_host.setValue( -1 );
-      Algorithms::distributedInplaceExclusiveScan( v_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
-#endif
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input: all ones; the expected exclusive prefix sum at global index i is i
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload (explicit std::plus reduction and explicit zero value), array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload (explicit std::plus reduction and explicit zero value), array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload taking a TNL functional (TNL::Plus) and no explicit zero value, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation (only begin and end are passed), array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end (only begin is passed), array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input: value i at global index i; expected exclusive prefix sum: 0 + 1 + ... + (i - 1) = i * (i - 1) / 2
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
+   }
+   // general overload (explicit std::plus reduction and explicit zero value), array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+   // general overload (explicit std::plus reduction and explicit zero value), array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+
+TYPED_TEST( DistributedScanTest, inplace_multiplication )
+{
+   this->localRange = Partitioner< typename TestFixture::IndexType >::splitRange( 10, this->group );
+   this->input_host.setDistribution( this->localRange, 0, 10, this->group );
+   this->input_host.setValue( 2 );  // input: ten twos, scanned with TNL::Multiplies below
+   this->expected_host = this->input_host;
+
+   // exclusive scan test: the expected value at global index i is the product of the i preceding twos, i.e. 2^i
+   int value = 1;
+   for( int i = 0; i < this->localRange.getEnd(); i++ ) {  // starts at 0 so that value covers all indices before the local range
+      if( this->localRange.getBegin() <= i )  // only indices inside the local range are written
+         this->expected_host[ i ] = value;
+      value *= 2;
    }
+
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test: every prefix additionally contains its own element, i.e. one more factor of 2
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] *= 2;
+
+   this->a.reset();
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, inplace_custom_begin_end )
+{
+   using IndexType = typename TestFixture::IndexType;
+
+   // choose [begin, end) so that it spans multiple processes when nproc > 1 (NOTE(review): presumably relies on an even split of globalSize - confirm)
+   const IndexType begin = 42;
+   const IndexType end = (this->nproc > 1) ? this->globalSize / this->nproc + begin : this->globalSize - begin;
+
+   // exclusive scan test: inside [begin, end) the expected values are 0, 1, 2, ...; outside the input value 1 must be kept
+   this->input_host.setValue( 1 );
+   this->expected_host.setValue( 1 );
+   int value = 0;
+   for( int i = begin; i < end; i++ ) {  // iterates over the whole scanned range so that value is the global prefix
+      if( this->localRange.getBegin() <= i && i < this->localRange.getEnd() )  // only indices inside the local range are written
+         this->expected_host[ i ] = value;
+      value++;
+   }
+
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a, false );
+
+   // inclusive scan test: every prefix inside [begin, end) additionally contains its own element (+1)
+   for( int i = begin; i < end; i++ )
+      if( this->localRange.getBegin() <= i && i < this->localRange.getEnd() )
+         this->expected_host[ i ]++;
+
+   this->a.reset();
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a, false );
+}
+
+TYPED_TEST( DistributedScanTest, inplace_empty_range )
+{
+   using IndexType = typename TestFixture::IndexType;
+
+   this->localRange = Partitioner< typename TestFixture::IndexType >::splitRange( 42, this->group );
+   this->input_host.setDistribution( this->localRange, 0, 42, this->group );
+   this->input_host.setValue( 1 );
+   this->expected_host = this->input_host;  // the expected result is the unmodified input
+
+   const IndexType begin = 2;
+   const IndexType end = 1;  // end < begin: the scanned range is empty
+
+   // exclusive scan test: scanning an empty range must leave the array unchanged
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a, false );
+
+   // inclusive scan test: same expectation as for the exclusive scan
+   this->a.reset();
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a, false );
 }
 
 #endif  // HAVE_GTEST
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
index c0e8f7118..1f7a87837 100644
--- a/src/UnitTests/Algorithms/scanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -1,23 +1,18 @@
 #pragma once
 
 #ifdef HAVE_GTEST
+#include <gtest/gtest.h>
 
 #include <TNL/Arithmetics/Quad.h>
 #include <TNL/Containers/Array.h>
+#include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/scan.h>
 
-#include "../Containers/VectorHelperFunctions.h"
-
-#include "gtest/gtest.h"
-
 using namespace TNL;
 using namespace TNL::Containers;
 using namespace TNL::Arithmetics;
 using namespace TNL::Algorithms;
-
-// should be small enough to have fast tests, but larger than minGPUReductionDataSize
-// and large enough to require multiple CUDA blocks for reduction
-constexpr int ARRAY_TEST_SIZE = 10000;
+using namespace TNL::Algorithms::detail;
 
 // test fixture for typed tests
 template< typename Array >
@@ -25,7 +20,67 @@ class ScanTest : public ::testing::Test
 {
 protected:
    using ArrayType = Array;
-   using ViewType = ArrayView< typename Array::ValueType, typename Array::DeviceType, typename Array::IndexType >;
+   using ValueType = typename ArrayType::ValueType;
+   using DeviceType = typename ArrayType::DeviceType;
+   using IndexType = typename ArrayType::IndexType;
+   using ArrayView = Containers::ArrayView< ValueType, DeviceType, IndexType >;
+   using VectorView = Containers::VectorView< ValueType, DeviceType, IndexType >;
+   using HostArrayType = typename ArrayType::template Self< ValueType, Devices::Sequential >;
+
+   ArrayType a, b, c;
+   ArrayView a_view, b_view, c_view;
+   VectorView av_view, bv_view, cv_view;
+   HostArrayType array_host, input_host, expected_host;
+
+   // should be small enough to have fast tests, but larger than minGPUReductionDataSize
+   // and large enough to require multiple CUDA blocks for reduction
+   // also should be a prime number to cause non-uniform distribution of the work
+   const int size = 9377;
+
+   ScanTest()  // prepares the working arrays and the host-side input/expected arrays
+   {
+      resetWorkingArrays();  // sizes a, b, c to `size` elements and binds all views
+      input_host = expected_host = a;  // both host arrays are assigned from a (presumably taking over its size - confirm)
+   }
+
+   void resetWorkingArrays()  // re-creates a, b, c, rebinds all views and resets the CUDA launcher limits
+   {
+      a.setSize( size );
+      a.setValue( -1 );  // fill with -1 (presumably so that stale data cannot pass for a scan result - confirm)
+      c = b = a;
+      a_view.bind( a );  // the views are bound again after the arrays were re-created above
+      b_view.bind( b );
+      c_view.bind( c );
+      av_view.bind( a );
+      bv_view.bind( b );
+      cv_view.bind( c );
+
+      // limit the maximum grid size of both scan launchers to 3 to make sure that we perform tests with multiple CUDA grids
+#ifdef HAVE_CUDA
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
+      }
+#endif
+   }
+
+   template< Algorithms::detail::ScanType ScanType >
+   void checkResult( const ArrayType& array )  // compares `array` element-wise with expected_host
+   {
+#ifdef HAVE_CUDA
+      // the grid count is checked only for CUDA arrays (same guard as in resetWorkingArrays) and skipped for too small arrays
+      if( std::is_same< DeviceType, Devices::Cuda >::value && array.getSize() > 256 )
+         EXPECT_GT( ( CudaScanKernelLauncher< ScanType, ValueType, IndexType >::gridsCount() ), 1 );
+#endif
+      // copy the result into the host array so that it can be indexed element by element
+      array_host = array;
+
+      for( int i = 0; i < array.getSize(); i++ )
+         EXPECT_EQ( array_host[ i ], expected_host[ i ] ) << "arrays differ at index i = " << i;
+   }
 };
 
 // types for which ScanTest is instantiated
@@ -85,244 +140,317 @@ using ArrayTypes = ::testing::Types<
 
 TYPED_TEST_SUITE( ScanTest, ArrayTypes );
 
-TYPED_TEST( ScanTest, inclusiveScan )
+TYPED_TEST( ScanTest, inplaceInclusiveScan_zero_array )
 {
-   using ArrayType = typename TestFixture::ArrayType;
-   using ViewType = typename TestFixture::ViewType;
-   using ValueType = typename ArrayType::ValueType;
-   using DeviceType = typename ArrayType::DeviceType;
-   using IndexType = typename ArrayType::IndexType;
-   using HostArrayType = typename ArrayType::template Self< ValueType, Devices::Sequential >;
-   const int size = ARRAY_TEST_SIZE;
-
-   ArrayType v( size );
-   ViewType v_view( v );
-   HostArrayType v_host( size );
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v_view;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v_view;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      inplaceInclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      inplaceInclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
-#endif
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inplaceInclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input: all ones; the expected inclusive prefix sum at index i is i + 1
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload (explicit std::plus reduction and explicit zero value), array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload (explicit std::plus reduction and explicit zero value), array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload taking a TNL functional (TNL::Plus) and no explicit zero value, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation (only begin and end are passed), array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end (only begin is passed), array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inplaceInclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input: value i at index i; expected inclusive prefix sum: 0 + 1 + ... + i = i * (i + 1) / 2
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i + 1)) / 2;
+   }
+   // general overload (explicit std::plus reduction and explicit zero value), array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+   // general overload (explicit std::plus reduction and explicit zero value), array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inplaceExclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input: all zeros; the expected exclusive prefix sum is zero everywhere
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload (explicit std::plus reduction and explicit zero value), array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload (explicit std::plus reduction and explicit zero value), array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload taking a TNL functional (TNL::Plus) and no explicit zero value, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation (only begin and end are passed), array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end (only begin is passed), array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inplaceExclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input: all ones; the expected exclusive prefix sum at index i is i
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload (explicit std::plus reduction and explicit zero value), array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload (explicit std::plus reduction and explicit zero value), array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload taking a TNL functional (TNL::Plus) and no explicit zero value, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation (only begin and end are passed), array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end (only begin is passed), array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inplaceExclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input: value i at index i; expected exclusive prefix sum: 0 + 1 + ... + (i - 1) = i * (i - 1) / 2
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
    }
+   // general overload (explicit std::plus reduction and explicit zero value), array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+   // general overload (explicit std::plus reduction and explicit zero value), array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
 }
 
-TYPED_TEST( ScanTest, exclusiveScan )
+
+TYPED_TEST( ScanTest, inplace_multiplication )
 {
-   using ArrayType = typename TestFixture::ArrayType;
-   using ViewType = typename TestFixture::ViewType;
-   using ValueType = typename ArrayType::ValueType;
-   using DeviceType = typename ArrayType::DeviceType;
-   using IndexType = typename ArrayType::IndexType;
-   using HostArrayType = typename ArrayType::template Self< ValueType, Devices::Sequential >;
-   const int size = ARRAY_TEST_SIZE;
-
-   ArrayType v;
-   v.setSize( size );
-   ViewType v_view( v );
-   HostArrayType v_host( size );
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      inplaceExclusiveScan( v, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      inplaceExclusiveScan( v_view, 0, size, std::plus<>{}, (ValueType) 0 );
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::detail::ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
-#endif
+   this->input_host.setSize( 10 );
+   this->input_host.setValue( 2 );
+   this->expected_host = this->input_host;
+
+   // exclusive scan test
+   int value = 1;
+   for( int i = 0; i < this->expected_host.getSize(); i++ ) {
+      this->expected_host[ i ] = value;
+      value *= 2;
    }
+
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test
+   for( int i = 0; i < this->expected_host.getSize(); i++ )
+      this->expected_host[ i ] *= 2;
+
+   this->a.reset();
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inplace_custom_begin_end )
+{
+   using IndexType = typename TestFixture::IndexType;
+   // scan only the sub-range [begin, end) = [42, size - 42) of the array
+   const IndexType begin = 42;
+   const IndexType end = this->size - begin;
+
+   // exclusive scan test: inside [begin, end) the expected value is i - begin; outside the input value 1 must be kept
+   this->input_host.setValue( 1 );
+   this->expected_host.setValue( 1 );
+   for( int i = begin; i < end; i++ )
+      this->expected_host[ i ] = i - begin;
+
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test: every prefix inside [begin, end) additionally contains its own element (+1)
+   for( int i = begin; i < end; i++ )
+      this->expected_host[ i ]++;
+
+   this->a.reset();
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
-// TODO: test scan with custom begin and end parameters
+TYPED_TEST( ScanTest, inplace_empty_range )
+{
+   using IndexType = typename TestFixture::IndexType;
+
+   this->input_host.setSize( 42 );
+   this->input_host.setValue( 1 );
+   this->expected_host = this->input_host;  // the expected result is the unmodified input
+
+   const IndexType begin = 2;
+   const IndexType end = 1;  // end < begin: the scanned range is empty
+
+   // exclusive scan test: scanning an empty range must leave the array unchanged
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test: same expectation as for the exclusive scan
+   this->a.reset();
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
 
 #endif // HAVE_GTEST
 
-- 
GitLab


From 76a95d0b12bf5c9e01868147969e6c05b3bbdac5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 14 Jul 2021 19:10:48 +0200
Subject: [PATCH 23/52] Implemented 'outplace' variants of scan and distributed
 scan functions

---
 .../Tutorials/ReductionAndScan/CMakeLists.txt |   2 +
 .../ReductionAndScan/exclusiveScanExample.cpp |  31 ++
 .../ReductionAndScan/exclusiveScanExample.cu  |   1 +
 .../ReductionAndScan/inclusiveScanExample.cpp |  31 ++
 .../ReductionAndScan/inclusiveScanExample.cu  |   1 +
 src/TNL/Algorithms/detail/CudaScanKernel.h    |  26 +-
 src/TNL/Algorithms/detail/DistributedScan.h   |  29 +-
 src/TNL/Algorithms/detail/Scan.h              | 147 ++++----
 src/TNL/Algorithms/detail/Scan.hpp            | 228 ++++++------
 src/TNL/Algorithms/distributedScan.h          | 150 +++++++-
 src/TNL/Algorithms/scan.h                     | 176 +++++++++-
 .../Algorithms/distributedScanTest.h          | 302 +++++++++++++++-
 src/UnitTests/Algorithms/scanTest.h           | 329 +++++++++++++++++-
 13 files changed, 1241 insertions(+), 212 deletions(-)
 create mode 100644 Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cpp
 create mode 120000 Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cu
 create mode 100644 Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cpp
 create mode 120000 Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cu

diff --git a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
index 85e762868..b88328a41 100644
--- a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
+++ b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
@@ -12,6 +12,8 @@ set( COMMON_EXAMPLES
      MapReduceExample-3
      ReductionWithArgument
      ReductionWithArgumentWithFunctional
+     inclusiveScanExample
+     exclusiveScanExample
      inplaceInclusiveScanExample
      inplaceExclusiveScanExample
      SegmentedScanExample
diff --git a/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cpp
new file mode 100644
index 000000000..6cd787990
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * First, compute the exclusive scan (prefix sum) of an array allocated on the host (CPU); the result goes to host_output.
+    */
+   Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
+   host_input = 1.0;
+   std::cout << "host_input = " << host_input << std::endl;
+   exclusiveScan( host_input, host_output );
+   std::cout << "host_output = " << host_output << std::endl;
+
+   /***
+    * Then repeat the same computation with arrays allocated on the GPU (compiled only when HAVE_CUDA is defined).
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
+   cuda_input = 1.0;
+   std::cout << "cuda_input = " << cuda_input << std::endl;
+   exclusiveScan( cuda_input, cuda_output );
+   std::cout << "cuda_output = " << cuda_output << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cu
new file mode 120000
index 000000000..a0f42394f
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cu
@@ -0,0 +1 @@
+exclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cpp
new file mode 100644
index 000000000..33737897d
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * First, compute the inclusive scan (prefix sum) of an array allocated on the host (CPU); the result goes to host_output.
+    */
+   Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
+   host_input = 1.0;
+   std::cout << "host_input = " << host_input << std::endl;
+   inclusiveScan( host_input, host_output );
+   std::cout << "host_output = " << host_output << std::endl;
+
+   /***
+    * Then repeat the same computation with arrays allocated on the GPU (compiled only when HAVE_CUDA is defined).
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
+   cuda_input = 1.0;
+   std::cout << "cuda_input = " << cuda_input << std::endl;
+   inclusiveScan( cuda_input, cuda_output );
+   std::cout << "cuda_output = " << cuda_output << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cu
new file mode 120000
index 000000000..b192a3348
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cu
@@ -0,0 +1 @@
+inclusiveScanExample.cpp
\ No newline at end of file
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index fe3b26549..3f2848d3a 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -177,9 +177,7 @@ cudaSecondPhaseBlockScan( Reduction reduction,
    }
 }
 
-template< ScanType scanType,
-          typename Real,
-          typename Index >
+template< ScanType scanType >
 struct CudaScanKernelLauncher
 {
    /****
@@ -194,12 +192,14 @@ struct CudaScanKernelLauncher
     *              `reduction(zero, x) == x` for any `x`.
     * \param blockSize  The CUDA block size to be used for kernel launch.
     */
-   template< typename Reduction >
+   template< typename Reduction,
+             typename Real,
+             typename Index >
    static void
    perform( const Index size,
             const Real* deviceInput,
             Real* deviceOutput,
-            Reduction& reduction,
+            Reduction&& reduction,
             const Real zero,
             const int blockSize = 256 )
    {
@@ -231,12 +231,14 @@ struct CudaScanKernelLauncher
     *              `reduction(zero, x) == x` for any `x`.
     * \param blockSize  The CUDA block size to be used for kernel launch.
     */
-   template< typename Reduction >
+   template< typename Reduction,
+             typename Real,
+             typename Index >
    static auto
    performFirstPhase( const Index size,
                       const Real* deviceInput,
                       Real* deviceOutput,
-                      Reduction& reduction,
+                      Reduction&& reduction,
                       const Real zero,
                       const int blockSize = 256 )
    {
@@ -288,7 +290,7 @@ struct CudaScanKernelLauncher
       if( numberOfBlocks > 1 ) {
          // we perform an inclusive scan, but the 0-th is zero and block results
          // were shifted by 1, so effectively we get an exclusive scan
-         CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform(
+         CudaScanKernelLauncher< ScanType::Inclusive >::perform(
             blockResults.getSize(),
             blockResults.getData(),
             blockResults.getData(),
@@ -318,14 +320,16 @@ struct CudaScanKernelLauncher
     *               the neutral value).
     * \param blockSize  The CUDA block size to be used for kernel launch.
     */
-   template< typename Reduction >
+   template< typename Reduction,
+             typename Real,
+             typename Index >
    static void
    performSecondPhase( const Index size,
                        Real* deviceOutput,
                        const Real* blockShifts,
-                       Reduction& reduction,
+                       Reduction&& reduction,
                        const Real shift,
-                       const Index blockSize = 256 )
+                       const int blockSize = 256 )
    {
       // compute the number of grids
       const int elementsInBlock = 8 * blockSize;
diff --git a/src/TNL/Algorithms/detail/DistributedScan.h b/src/TNL/Algorithms/detail/DistributedScan.h
index 1046f3df6..db27948f2 100644
--- a/src/TNL/Algorithms/detail/DistributedScan.h
+++ b/src/TNL/Algorithms/detail/DistributedScan.h
@@ -24,28 +24,31 @@ namespace detail {
 template< ScanType Type >
 struct DistributedScan
 {
-   template< typename DistributedArray,
+   template< typename InputDistributedArray,
+             typename OutputDistributedArray,
              typename Reduction >
    static void
-   perform( DistributedArray& v,
-            typename DistributedArray::IndexType begin,
-            typename DistributedArray::IndexType end,
+   perform( const InputDistributedArray& input,
+            OutputDistributedArray& output,
+            typename InputDistributedArray::IndexType begin,
+            typename InputDistributedArray::IndexType end,
             Reduction&& reduction,
-            typename DistributedArray::ValueType zero )
+            typename OutputDistributedArray::ValueType zero )
    {
-      using ValueType = typename DistributedArray::ValueType;
-      using DeviceType = typename DistributedArray::DeviceType;
+      using ValueType = typename OutputDistributedArray::ValueType;
+      using DeviceType = typename OutputDistributedArray::DeviceType;
 
-      const auto group = v.getCommunicationGroup();
+      const auto group = input.getCommunicationGroup();
       if( group != MPI::NullGroup() ) {
          // adjust begin and end for the local range
-         const auto localRange = v.getLocalRange();
+         const auto localRange = input.getLocalRange();
          begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin();
          end = max( min( end, localRange.getEnd() ), localRange.getBegin() ) - localRange.getBegin();
 
          // perform first phase on the local data
-         auto localView = v.getLocalView();
-         const auto block_results = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
+         const auto inputLocalView = input.getConstLocalView();
+         auto outputLocalView = output.getLocalView();
+         const auto block_results = Scan< DeviceType, Type >::performFirstPhase( inputLocalView, outputLocalView, begin, end, begin, reduction, zero );
          const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );
 
          // exchange local results between ranks
@@ -57,11 +60,11 @@ struct DistributedScan
          MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );
 
          // compute the scan of the per-rank results
-         Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, 0, nproc, reduction, zero );
+         Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, rank_results, 0, nproc, 0, reduction, zero );
 
          // perform the second phase, using the per-block and per-rank results
          const int rank = MPI::GetRank( group );
-         Scan< DeviceType, Type >::performSecondPhase( localView, block_results, begin, end, reduction, rank_results[ rank ] );
+         Scan< DeviceType, Type >::performSecondPhase( inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, rank_results[ rank ] );
       }
    }
 };
diff --git a/src/TNL/Algorithms/detail/Scan.h b/src/TNL/Algorithms/detail/Scan.h
index 5fee86b60..872dcd79a 100644
--- a/src/TNL/Algorithms/detail/Scan.h
+++ b/src/TNL/Algorithms/detail/Scan.h
@@ -27,100 +27,127 @@ struct Scan;
 template< ScanType Type >
 struct Scan< Devices::Sequential, Type >
 {
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
    static void
-   perform( Vector& v,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::ValueType zero );
-
-   template< typename Vector,
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
+            Reduction&& reduction,
+            typename OutputArray::ValueType zero );
+
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
    static auto
-   performFirstPhase( Vector& v,
-                      const typename Vector::IndexType begin,
-                      const typename Vector::IndexType end,
-                      const Reduction& reduction,
-                      const typename Vector::ValueType zero );
-
-   template< typename Vector,
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
+                      Reduction&& reduction,
+                      typename OutputArray::ValueType zero );
+
+   template< typename InputArray,
+             typename OutputArray,
              typename BlockShifts,
              typename Reduction >
    static void
-   performSecondPhase( Vector& v,
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
                        const BlockShifts& blockShifts,
-                       const typename Vector::IndexType begin,
-                       const typename Vector::IndexType end,
-                       const Reduction& reduction,
-                       const typename Vector::ValueType zero );
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
+                       Reduction&& reduction,
+                       typename OutputArray::ValueType zero );
 };
 
 template< ScanType Type >
 struct Scan< Devices::Host, Type >
 {
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
    static void
-   perform( Vector& v,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::ValueType zero );
-
-   template< typename Vector,
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
+            Reduction&& reduction,
+            typename OutputArray::ValueType zero );
+
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
    static auto
-   performFirstPhase( Vector& v,
-                      const typename Vector::IndexType begin,
-                      const typename Vector::IndexType end,
-                      const Reduction& reduction,
-                      const typename Vector::ValueType zero );
-
-   template< typename Vector,
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
+                      Reduction&& reduction,
+                      typename OutputArray::ValueType zero );
+
+   template< typename InputArray,
+             typename OutputArray,
              typename BlockShifts,
              typename Reduction >
    static void
-   performSecondPhase( Vector& v,
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
                        const BlockShifts& blockShifts,
-                       const typename Vector::IndexType begin,
-                       const typename Vector::IndexType end,
-                       const Reduction& reduction,
-                       const typename Vector::ValueType zero );
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
+                       Reduction&& reduction,
+                       typename OutputArray::ValueType zero );
 };
 
 template< ScanType Type >
 struct Scan< Devices::Cuda, Type >
 {
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
    static void
-   perform( Vector& v,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::ValueType zero );
-
-   template< typename Vector,
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
+            Reduction&& reduction,
+            typename OutputArray::ValueType zero );
+
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
    static auto
-   performFirstPhase( Vector& v,
-                      const typename Vector::IndexType begin,
-                      const typename Vector::IndexType end,
-                      const Reduction& reduction,
-                      const typename Vector::ValueType zero );
-
-   template< typename Vector,
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
+                      Reduction&& reduction,
+                      typename OutputArray::ValueType zero );
+
+   template< typename InputArray,
+             typename OutputArray,
              typename BlockShifts,
              typename Reduction >
    static void
-   performSecondPhase( Vector& v,
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
                        const BlockShifts& blockShifts,
-                       const typename Vector::IndexType begin,
-                       const typename Vector::IndexType end,
-                       const Reduction& reduction,
-                       const typename Vector::ValueType zero );
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
+                       Reduction&& reduction,
+                       typename OutputArray::ValueType zero );
 };
 
 } // namespace detail
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
index ee5a4bc1f..d982f1731 100644
--- a/src/TNL/Algorithms/detail/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -26,86 +26,95 @@ namespace Algorithms {
 namespace detail {
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
 void
 Scan< Devices::Sequential, Type >::
-perform( Vector& v,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::ValueType zero )
+perform( const InputArray& input,
+         OutputArray& output,
+         typename InputArray::IndexType begin,
+         typename InputArray::IndexType end,
+         typename OutputArray::IndexType outputBegin,
+         Reduction&& reduction,
+         typename OutputArray::ValueType zero )
 {
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
+   using ValueType = typename OutputArray::ValueType;
 
    // simple sequential algorithm - not split into phases
    ValueType aux = zero;
    if( Type == ScanType::Inclusive ) {
-      for( IndexType i = begin; i < end; i++ )
-         v[ i ] = aux = reduction( aux, v[ i ] );
+      for( ; begin < end; begin++, outputBegin++ )
+         output[ outputBegin ] = aux = reduction( aux, input[ begin ] );
    }
    else // Exclusive scan
    {
-      for( IndexType i = begin; i < end; i++ ) {
-         const ValueType x = v[ i ];
-         v[ i ] = aux;
+      for( ; begin < end; begin++, outputBegin++ ) {
+         const ValueType x = input[ begin ];
+         output[ outputBegin ] = aux;
          aux = reduction( aux, x );
       }
    }
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
 auto
 Scan< Devices::Sequential, Type >::
-performFirstPhase( Vector& v,
-                   const typename Vector::IndexType begin,
-                   const typename Vector::IndexType end,
-                   const Reduction& reduction,
-                   const typename Vector::ValueType zero )
+performFirstPhase( const InputArray& input,
+                   OutputArray& output,
+                   typename InputArray::IndexType begin,
+                   typename InputArray::IndexType end,
+                   typename OutputArray::IndexType outputBegin,
+                   Reduction&& reduction,
+                   typename OutputArray::ValueType zero )
 {
-   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
-//   Containers::StaticArray< 2, ValueType > block_results;
-   Containers::Array< typename Vector::ValueType, Devices::Sequential > block_results( 2 );
    // artificial first phase - only reduce the block
+   Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 2 );
    block_results[ 0 ] = zero;
-   block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, v, reduction, zero );
+   block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, input, reduction, zero );
    return block_results;
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename BlockShifts,
              typename Reduction >
 void
 Scan< Devices::Sequential, Type >::
-performSecondPhase( Vector& v,
+performSecondPhase( const InputArray& input,
+                    OutputArray& output,
                     const BlockShifts& blockShifts,
-                    const typename Vector::IndexType begin,
-                    const typename Vector::IndexType end,
-                    const Reduction& reduction,
-                    const typename Vector::ValueType zero )
+                    typename InputArray::IndexType begin,
+                    typename InputArray::IndexType end,
+                    typename OutputArray::IndexType outputBegin,
+                    Reduction&& reduction,
+                    typename OutputArray::ValueType zero )
 {
    // artificial second phase - only one block, use the shift as the initial value
-   perform( v, begin, end, reduction, reduction( zero, blockShifts[ 0 ] ) );
+   perform( input, output, begin, end, outputBegin, reduction, reduction( zero, blockShifts[ 0 ] ) );
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
 void
 Scan< Devices::Host, Type >::
-perform( Vector& v,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::ValueType zero )
+perform( const InputArray& input,
+         OutputArray& output,
+         typename InputArray::IndexType begin,
+         typename InputArray::IndexType end,
+         typename OutputArray::IndexType outputBegin,
+         Reduction&& reduction,
+         typename OutputArray::ValueType zero )
 {
 #ifdef HAVE_OPENMP
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
+   using ValueType = typename OutputArray::ValueType;
+   using IndexType = typename InputArray::IndexType;
 
    if( end <= begin )
       return;
@@ -121,47 +130,52 @@ perform( Vector& v,
 
       #pragma omp parallel num_threads(threads)
       {
-         const IndexType block_idx = omp_get_thread_num();
-         const IndexType block_begin = begin + block_idx * block_size;
+         const int block_idx = omp_get_thread_num();
+         const IndexType block_offset = block_idx * block_size;
+         const IndexType block_begin = begin + block_offset;
          const IndexType block_end = TNL::min( block_begin + block_size, end );
+         const IndexType block_output_begin = outputBegin + block_offset;
 
          // step 1: per-block reductions, write the result into the buffer
-         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, v, reduction, zero );
+         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
 
          #pragma omp barrier
 
          // step 2: scan the block results
          #pragma omp single
          {
-            Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, 0, blocks + 1, reduction, zero );
+            Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
          }
 
          // step 3: per-block scan using the block results as initial values
-         Scan< Devices::Sequential, Type >::perform( v, block_begin, block_end, reduction, block_results[ block_idx ] );
+         Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, block_results[ block_idx ] );
       }
    }
    else
 #endif
-      Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero );
+      Scan< Devices::Sequential, Type >::perform( input, output, begin, end, outputBegin, reduction, zero );
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
 auto
 Scan< Devices::Host, Type >::
-performFirstPhase( Vector& v,
-                   const typename Vector::IndexType begin,
-                   const typename Vector::IndexType end,
-                   const Reduction& reduction,
-                   const typename Vector::ValueType zero )
+performFirstPhase( const InputArray& input,
+                   OutputArray& output,
+                   typename InputArray::IndexType begin,
+                   typename InputArray::IndexType end,
+                   typename OutputArray::IndexType outputBegin,
+                   Reduction&& reduction,
+                   typename OutputArray::ValueType zero )
 {
 #ifdef HAVE_OPENMP
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
+   using ValueType = typename OutputArray::ValueType;
+   using IndexType = typename InputArray::IndexType;
 
    if( end <= begin ) {
-      Containers::Array< typename Vector::ValueType, Devices::Sequential > block_results( 1 );
+      Containers::Array< ValueType, Devices::Sequential > block_results( 1 );
       block_results.setValue( zero );
       return block_results;
    }
@@ -177,41 +191,43 @@ performFirstPhase( Vector& v,
 
       #pragma omp parallel num_threads(threads)
       {
-         const IndexType block_idx = omp_get_thread_num();
+         const int block_idx = omp_get_thread_num();
          const IndexType block_begin = begin + block_idx * block_size;
          const IndexType block_end = TNL::min( block_begin + block_size, end );
 
          // step 1: per-block reductions, write the result into the buffer
-         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, v, reduction, zero );
+         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
       }
 
       // step 2: scan the block results
-      Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, 0, blocks + 1, reduction, zero );
+      Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
 
       // block_results now contains shift values for each block - to be used in the second phase
       return block_results;
    }
    else
 #endif
-      return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
+      return Scan< Devices::Sequential, Type >::performFirstPhase( input, output, begin, end, outputBegin, reduction, zero );
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename BlockShifts,
              typename Reduction >
 void
 Scan< Devices::Host, Type >::
-performSecondPhase( Vector& v,
+performSecondPhase( const InputArray& input,
+                    OutputArray& output,
                     const BlockShifts& blockShifts,
-                    const typename Vector::IndexType begin,
-                    const typename Vector::IndexType end,
-                    const Reduction& reduction,
-                    const typename Vector::ValueType zero )
+                    typename InputArray::IndexType begin,
+                    typename InputArray::IndexType end,
+                    typename OutputArray::IndexType outputBegin,
+                    Reduction&& reduction,
+                    typename OutputArray::ValueType zero )
 {
 #ifdef HAVE_OPENMP
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
+   using IndexType = typename InputArray::IndexType;
 
    if( end <= begin )
       return;
@@ -225,41 +241,43 @@ performSecondPhase( Vector& v,
       const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
       #pragma omp parallel num_threads(threads)
       {
-         const IndexType block_idx = omp_get_thread_num();
-         const IndexType block_begin = begin + block_idx * block_size;
+         const int block_idx = omp_get_thread_num();
+         const IndexType block_offset = block_idx * block_size;
+         const IndexType block_begin = begin + block_offset;
          const IndexType block_end = TNL::min( block_begin + block_size, end );
+         const IndexType block_output_begin = outputBegin + block_offset;
 
          // phase 2: per-block scan using the block results as initial values
-         Scan< Devices::Sequential, Type >::perform( v, block_begin, block_end, reduction, reduction( zero, blockShifts[ block_idx ] ) );
+         Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, reduction( zero, blockShifts[ block_idx ] ) );
       }
    }
    else
 #endif
-      Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, zero );
+      Scan< Devices::Sequential, Type >::performSecondPhase( input, output, blockShifts, begin, end, outputBegin, reduction, zero );
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
 void
 Scan< Devices::Cuda, Type >::
-perform( Vector& v,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::ValueType zero )
+perform( const InputArray& input,
+         OutputArray& output,
+         typename InputArray::IndexType begin,
+         typename InputArray::IndexType end,
+         typename OutputArray::IndexType outputBegin,
+         Reduction&& reduction,
+         typename OutputArray::ValueType zero )
 {
 #ifdef HAVE_CUDA
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
-
    if( end <= begin )
       return;
 
-   detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::perform(
+   detail::CudaScanKernelLauncher< Type >::perform(
       end - begin,
-      &v.getData()[ begin ],  // input
-      &v.getData()[ begin ],  // output
+      &input.getData()[ begin ],
+      &output.getData()[ outputBegin ],
       reduction,
       zero );
 #else
@@ -268,30 +286,30 @@ perform( Vector& v,
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename Reduction >
 auto
 Scan< Devices::Cuda, Type >::
-performFirstPhase( Vector& v,
-                   const typename Vector::IndexType begin,
-                   const typename Vector::IndexType end,
-                   const Reduction& reduction,
-                   const typename Vector::ValueType zero )
+performFirstPhase( const InputArray& input,
+                   OutputArray& output,
+                   typename InputArray::IndexType begin,
+                   typename InputArray::IndexType end,
+                   typename OutputArray::IndexType outputBegin,
+                   Reduction&& reduction,
+                   typename OutputArray::ValueType zero )
 {
 #ifdef HAVE_CUDA
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
-
    if( end <= begin ) {
-      Containers::Array< typename Vector::ValueType, Devices::Cuda > block_results( 1 );
+      Containers::Array< typename OutputArray::ValueType, Devices::Cuda > block_results( 1 );
       block_results.setValue( zero );
       return block_results;
    }
 
-   return detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performFirstPhase(
+   return detail::CudaScanKernelLauncher< Type >::performFirstPhase(
       end - begin,
-      &v.getData()[ begin ],  // input
-      &v.getData()[ begin ],  // output
+      &input.getData()[ begin ],
+      &output.getData()[ outputBegin ],
       reduction,
       zero );
 #else
@@ -300,28 +318,28 @@ performFirstPhase( Vector& v,
 }
 
 template< ScanType Type >
-   template< typename Vector,
+   template< typename InputArray,
+             typename OutputArray,
              typename BlockShifts,
              typename Reduction >
 void
 Scan< Devices::Cuda, Type >::
-performSecondPhase( Vector& v,
+performSecondPhase( const InputArray& input,
+                    OutputArray& output,
                     const BlockShifts& blockShifts,
-                    const typename Vector::IndexType begin,
-                    const typename Vector::IndexType end,
-                    const Reduction& reduction,
-                    const typename Vector::ValueType zero )
+                    typename InputArray::IndexType begin,
+                    typename InputArray::IndexType end,
+                    typename OutputArray::IndexType outputBegin,
+                    Reduction&& reduction,
+                    typename OutputArray::ValueType zero )
 {
 #ifdef HAVE_CUDA
-   using ValueType = typename Vector::ValueType;
-   using IndexType = typename Vector::IndexType;
-
    if( end <= begin )
       return;
 
-   detail::CudaScanKernelLauncher< Type, ValueType, IndexType >::performSecondPhase(
+   detail::CudaScanKernelLauncher< Type >::performSecondPhase(
       end - begin,
-      &v.getData()[ begin ],  // output
+      &output.getData()[ outputBegin ],
       blockShifts.getData(),
       reduction,
       zero );
diff --git a/src/TNL/Algorithms/distributedScan.h b/src/TNL/Algorithms/distributedScan.h
index afabb8820..e8f001f85 100644
--- a/src/TNL/Algorithms/distributedScan.h
+++ b/src/TNL/Algorithms/distributedScan.h
@@ -20,6 +20,152 @@
 namespace TNL {
 namespace Algorithms {
 
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of an input distributed array and stores it in an output distributed array.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam InputDistributedArray type of the distributed array to be scanned
+ * \tparam OutputDistributedArray type of the distributed array where the result is stored
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input distributed array to be scanned
+ * \param output the distributed array where the result will be stored
+ * \param begin,end the first element and the end (one past the last element) of the range to be scanned
+ * \param reduction functor implementing the reduction operation
+ * \param zero the idempotent element for the reduction operation, i.e. an element which does not change the result of the reduction
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction >
+void
+distributedInclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin,
+                          typename InputDistributedArray::IndexType end,
+                          Reduction&& reduction,
+                          typename OutputDistributedArray::ValueType zero )
+{
+   // compile-time check: both arrays are declared for the same device type
+   using InputDevice = typename InputDistributedArray::DeviceType;
+   using OutputDevice = typename OutputDistributedArray::DeviceType;
+   static_assert( std::is_same< InputDevice, OutputDevice >::value, "The input and output arrays must have the same device type." );
+   // run-time checks: both arrays share the communicator and the local range
+   TNL_ASSERT_EQ( input.getCommunicationGroup(), output.getCommunicationGroup(), "The input and output arrays must have the same MPI communicator." );
+   TNL_ASSERT_EQ( input.getLocalRange(), output.getLocalRange(), "The input and output arrays must have the same local range on all ranks." );
+   detail::DistributedScan< detail::ScanType::Inclusive >::perform( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+   output.startSynchronization();
+}
+
+/**
+ * \brief Overload of \ref distributedInclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The idempotent value is taken as `reduction.template getIdempotent< typename OutputDistributedArray::ValueType >()`.
+ * See \ref distributedInclusiveScan for the other parameters; when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedInclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin = 0,
+                          typename InputDistributedArray::IndexType end = 0,
+                          Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = input.getSize();
+   // decay: `Reduction` is deduced as an lvalue reference type when a named functor object is passed
+   constexpr typename OutputDistributedArray::ValueType zero = std::decay_t< Reduction >::template getIdempotent< typename OutputDistributedArray::ValueType >();
+   distributedInclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of an input distributed array and stores it in an output distributed array.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam InputDistributedArray type of the distributed array to be scanned
+ * \tparam OutputDistributedArray type of the distributed array where the result is stored
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input distributed array to be scanned
+ * \param output the distributed array where the result will be stored
+ * \param begin,end the first element and the end (one past the last element) of the range to be scanned
+ * \param reduction functor implementing the reduction operation
+ * \param zero the idempotent element for the reduction operation, i.e. an element which does not change the result of the reduction
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction >
+void
+distributedExclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin,
+                          typename InputDistributedArray::IndexType end,
+                          Reduction&& reduction,
+                          typename OutputDistributedArray::ValueType zero )
+{
+   // compile-time check: both arrays are declared for the same device type
+   using InputDevice = typename InputDistributedArray::DeviceType;
+   using OutputDevice = typename OutputDistributedArray::DeviceType;
+   static_assert( std::is_same< InputDevice, OutputDevice >::value, "The input and output arrays must have the same device type." );
+   // run-time checks: both arrays share the communicator and the local range
+   TNL_ASSERT_EQ( input.getCommunicationGroup(), output.getCommunicationGroup(), "The input and output arrays must have the same MPI communicator." );
+   TNL_ASSERT_EQ( input.getLocalRange(), output.getLocalRange(), "The input and output arrays must have the same local range on all ranks." );
+   detail::DistributedScan< detail::ScanType::Exclusive >::perform( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+   output.startSynchronization();
+}
+
+/**
+ * \brief Overload of \ref distributedExclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The idempotent value is taken as `reduction.template getIdempotent< typename OutputDistributedArray::ValueType >()`.
+ * See \ref distributedExclusiveScan for the other parameters; when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedExclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin = 0,
+                          typename InputDistributedArray::IndexType end = 0,
+                          Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = input.getSize();
+   // decay: `Reduction` is deduced as an lvalue reference type when a named functor object is passed
+   constexpr typename OutputDistributedArray::ValueType zero = std::decay_t< Reduction >::template getIdempotent< typename OutputDistributedArray::ValueType >();
+   distributedExclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+}
+
 /**
  * \brief Computes an inclusive scan (or prefix sum) of a distributed array in-place.
  *
@@ -57,7 +203,7 @@ distributedInplaceInclusiveScan( DistributedArray& array,
                                  typename DistributedArray::ValueType zero )
 {
    using Scan = detail::DistributedScan< detail::ScanType::Inclusive >;
-   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), zero );
    array.startSynchronization();
 }
 
@@ -120,7 +266,7 @@ distributedInplaceExclusiveScan( DistributedArray& array,
                                  typename DistributedArray::ValueType zero )
 {
    using Scan = detail::DistributedScan< detail::ScanType::Exclusive >;
-   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), zero );
    array.startSynchronization();
 }
 
diff --git a/src/TNL/Algorithms/scan.h b/src/TNL/Algorithms/scan.h
index 0a6f11f5b..302f7f844 100644
--- a/src/TNL/Algorithms/scan.h
+++ b/src/TNL/Algorithms/scan.h
@@ -20,6 +20,178 @@
 namespace TNL {
 namespace Algorithms {
 
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of an input array and
+ *        stores it in an output array.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam InputArray type of the array to be scanned
+ * \tparam OutputArray type of the output array
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input array to be scanned
+ * \param output the array where the result will be stored
+ * \param begin the first element in the array to be scanned
+ * \param end the last element in the array to be scanned
+ * \param outputBegin the first element in the output array to be written. There
+ *                    must be at least `end - begin` elements in the output
+ *                    array starting at the position given by `outputBegin`.
+ * \param reduction functor implementing the reduction operation
+ * \param zero is the idempotent element for the reduction operation, i.e.
+ *             element which does not change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/inclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include inclusiveScanExample.out
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction >
+void
+inclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin,
+               typename InputArray::IndexType end,
+               typename OutputArray::IndexType outputBegin,
+               Reduction&& reduction,
+               typename OutputArray::ValueType zero )
+{
+   // the scan is dispatched on the device type of the output array, so the input array must match it
+   using DeviceType = typename OutputArray::DeviceType;
+   static_assert( std::is_same< typename InputArray::DeviceType, DeviceType >::value, "The input and output arrays must have the same device type." );
+   // sanity check of the supplied idempotent element
+   TNL_ASSERT_EQ( reduction( zero, zero ), zero, "zero is not an idempotent value of the reduction operation" );
+   detail::Scan< DeviceType, detail::ScanType::Inclusive >::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Overload of \ref inclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The idempotent value is taken as `reduction.template getIdempotent< typename OutputArray::ValueType >()`.
+ * See \ref inclusiveScan for the other parameters; when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction = TNL::Plus >
+void
+inclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin = 0,
+               typename InputArray::IndexType end = 0,
+               typename OutputArray::IndexType outputBegin = 0,
+               Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = input.getSize();
+   // decay: `Reduction` is deduced as an lvalue reference type when a named functor object is passed
+   constexpr typename OutputArray::ValueType zero = std::decay_t< Reduction >::template getIdempotent< typename OutputArray::ValueType >();
+   inclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of an input array and
+ *        stores it in an output array.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam InputArray type of the array to be scanned
+ * \tparam OutputArray type of the output array
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input array to be scanned
+ * \param output the array where the result will be stored
+ * \param begin the first element in the array to be scanned
+ * \param end the last element in the array to be scanned
+ * \param outputBegin the first element in the output array to be written. There
+ *                    must be at least `end - begin` elements in the output
+ *                    array starting at the position given by `outputBegin`.
+ * \param reduction functor implementing the reduction operation
+ * \param zero is the idempotent element for the reduction operation, i.e.
+ *             element which does not change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/exclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include exclusiveScanExample.out
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction >
+void
+exclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin,
+               typename InputArray::IndexType end,
+               typename OutputArray::IndexType outputBegin,
+               Reduction&& reduction,
+               typename OutputArray::ValueType zero )
+{
+   // the scan is dispatched on the device type of the output array, so the input array must match it
+   using DeviceType = typename OutputArray::DeviceType;
+   static_assert( std::is_same< typename InputArray::DeviceType, DeviceType >::value, "The input and output arrays must have the same device type." );
+   // sanity check of the supplied idempotent element
+   TNL_ASSERT_EQ( reduction( zero, zero ), zero, "zero is not an idempotent value of the reduction operation" );
+   detail::Scan< DeviceType, detail::ScanType::Exclusive >::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+}
+
+/**
+ * \brief Overload of \ref exclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The idempotent value is taken as `reduction.template getIdempotent< typename OutputArray::ValueType >()`.
+ * See \ref exclusiveScan for the other parameters; when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction = TNL::Plus >
+void
+exclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin = 0,
+               typename InputArray::IndexType end = 0,
+               typename OutputArray::IndexType outputBegin = 0,
+               Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = input.getSize();
+   // decay: `Reduction` is deduced as an lvalue reference type when a named functor object is passed
+   constexpr typename OutputArray::ValueType zero = std::decay_t< Reduction >::template getIdempotent< typename OutputArray::ValueType >();
+   exclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+}
+
 /**
  * \brief Computes an inclusive scan (or prefix sum) of an array in-place.
  *
@@ -67,7 +239,7 @@ inplaceInclusiveScan( Array& array,
    TNL_ASSERT_EQ( reduction( zero, zero ), zero,
                   "zero is not an idempotent value of the reduction operation" );
    using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Inclusive >;
-   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), zero );
 }
 
 /**
@@ -139,7 +311,7 @@ inplaceExclusiveScan( Array& array,
    TNL_ASSERT_EQ( reduction( zero, zero ), zero,
                   "zero is not an idempotent value of the reduction operation" );
    using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Exclusive >;
-   Scan::perform( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), zero );
 }
 
 /**
diff --git a/src/UnitTests/Algorithms/distributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
index ab857001a..757c82477 100644
--- a/src/UnitTests/Algorithms/distributedScanTest.h
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -87,10 +87,10 @@ protected:
 #ifdef HAVE_CUDA
       if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive >::maxGridSize() = 3;
       }
 #endif
    }
@@ -101,7 +101,7 @@ protected:
 #ifdef HAVE_CUDA
       // skip the check for too small arrays
       if( check_cuda_grids && array.getLocalRange().getSize() > 256 )
-         EXPECT_GT( ( CudaScanKernelLauncher< ScanType, ValueType, IndexType >::gridsCount() ), 1 );
+         EXPECT_GT( ( CudaScanKernelLauncher< ScanType >::gridsCount() ), 1 );
 #endif
 
       array_host = array;
@@ -126,6 +126,60 @@ TYPED_TEST_SUITE( DistributedScanTest, DistributedArrayTypes );
 
 // TODO: test that horizontal operations are computed for ghost values without synchronization
 
+TYPED_TEST( DistributedScanTest, distributedInclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // an all-zero input is expected to give an all-zero scan; the input array must stay unchanged
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_zero_array )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -174,6 +228,61 @@ TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_zero_array )
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
+TYPED_TEST( DistributedScanTest, distributedInclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input of all ones: the inclusive prefix sum at global index i is i + 1
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_constant_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -223,6 +332,28 @@ TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_constant_sequen
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
+TYPED_TEST( DistributedScanTest, distributedInclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input a_i = i, so the inclusive prefix sum at index i is 0 + 1 + ... + i = i * (i + 1) / 2
+   for( int idx = this->localRange.getBegin(); idx < this->localRange.getEnd(); ++idx ) {
+      this->input_host[ idx ] = idx;
+      this->expected_host[ idx ] = idx * ( idx + 1 ) / 2;
+   }
+   // variant 1: scan called on the arrays
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, ValueType( 0 ) );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+   // variant 2: scan called on the array views
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, ValueType( 0 ) );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_linear_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -243,6 +374,60 @@ TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_linear_sequence
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
+TYPED_TEST( DistributedScanTest, distributedExclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // an all-zero input is expected to give an all-zero scan; the input array must stay unchanged
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_zero_array )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -291,6 +476,61 @@ TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_zero_array )
    this->template checkResult< ScanType::Exclusive >( this->a );
 }
 
+TYPED_TEST( DistributedScanTest, distributedExclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+   // input of all ones: the exclusive prefix sum at global index i is i
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_constant_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -340,6 +580,28 @@ TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_constant_sequen
    this->template checkResult< ScanType::Exclusive >( this->a );
 }
 
+TYPED_TEST( DistributedScanTest, distributedExclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_linear_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -361,7 +623,7 @@ TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_linear_sequence
 }
 
 
-TYPED_TEST( DistributedScanTest, inplace_multiplication )
+TYPED_TEST( DistributedScanTest, multiplication )
 {
    this->localRange = Partitioner< typename TestFixture::IndexType >::splitRange( 10, this->group );
    this->input_host.setDistribution( this->localRange, 0, 10, this->group );
@@ -377,6 +639,10 @@ TYPED_TEST( DistributedScanTest, inplace_multiplication )
    }
 
    this->a = this->input_host;
+   this->b = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    distributedInplaceExclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
    this->template checkResult< ScanType::Exclusive >( this->a );
 
@@ -386,11 +652,15 @@ TYPED_TEST( DistributedScanTest, inplace_multiplication )
 
    this->a.reset();
    this->a = this->input_host;
+   this->b = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    distributedInplaceInclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
-TYPED_TEST( DistributedScanTest, inplace_custom_begin_end )
+TYPED_TEST( DistributedScanTest, custom_begin_end )
 {
    using IndexType = typename TestFixture::IndexType;
 
@@ -409,6 +679,10 @@ TYPED_TEST( DistributedScanTest, inplace_custom_begin_end )
    }
 
    this->a = this->input_host;
+   this->b = this->input_host;
+   distributedExclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
    distributedInplaceExclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Exclusive >( this->a, false );
 
@@ -419,11 +693,15 @@ TYPED_TEST( DistributedScanTest, inplace_custom_begin_end )
 
    this->a.reset();
    this->a = this->input_host;
+   this->b = this->input_host;
+   distributedInclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
    distributedInplaceInclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Inclusive >( this->a, false );
 }
 
-TYPED_TEST( DistributedScanTest, inplace_empty_range )
+TYPED_TEST( DistributedScanTest, empty_range )
 {
    using IndexType = typename TestFixture::IndexType;
 
@@ -437,12 +715,20 @@ TYPED_TEST( DistributedScanTest, inplace_empty_range )
 
    // exclusive scan test
    this->a = this->input_host;
+   this->b = this->input_host;
+   distributedExclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
    distributedInplaceExclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Exclusive >( this->a, false );
 
    // inclusive scan test
    this->a.reset();
    this->a = this->input_host;
+   this->b = this->input_host;
+   distributedInclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
    distributedInplaceInclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Inclusive >( this->a, false );
 }
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
index 1f7a87837..2ad5453e3 100644
--- a/src/UnitTests/Algorithms/scanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -59,10 +59,10 @@ protected:
 #ifdef HAVE_CUDA
       if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive, ValueType, IndexType >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive, ValueType, IndexType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive >::maxGridSize() = 3;
       }
 #endif
    }
@@ -73,7 +73,7 @@ protected:
 #ifdef HAVE_CUDA
       // skip the check for too small arrays
       if( array.getSize() > 256 )
-         EXPECT_GT( ( CudaScanKernelLauncher< ScanType, ValueType, IndexType >::gridsCount() ), 1 );
+         EXPECT_GT( ( CudaScanKernelLauncher< ScanType >::gridsCount() ), 1 );
 #endif
 
       array_host = array;
@@ -140,6 +140,66 @@ using ArrayTypes = ::testing::Types<
 
 TYPED_TEST_SUITE( ScanTest, ArrayTypes );
 
+TYPED_TEST( ScanTest, inclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   this->resetWorkingArrays();  // as between the other overloads: start the next check from reset working arrays
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( ScanTest, inplaceInclusiveScan_zero_array )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -188,6 +248,67 @@ TYPED_TEST( ScanTest, inplaceInclusiveScan_zero_array )
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
+TYPED_TEST( ScanTest, inclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   this->resetWorkingArrays();  // as between the other overloads: start the next check from reset working arrays
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( ScanTest, inplaceInclusiveScan_constant_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -237,6 +358,28 @@ TYPED_TEST( ScanTest, inplaceInclusiveScan_constant_sequence )
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
+TYPED_TEST( ScanTest, inclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i + 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( ScanTest, inplaceInclusiveScan_linear_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -257,6 +400,66 @@ TYPED_TEST( ScanTest, inplaceInclusiveScan_linear_sequence )
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
+TYPED_TEST( ScanTest, exclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   this->resetWorkingArrays();  // as between the other overloads: start the next check from reset working arrays
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( ScanTest, inplaceExclusiveScan_zero_array )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -305,6 +508,67 @@ TYPED_TEST( ScanTest, inplaceExclusiveScan_zero_array )
    this->template checkResult< ScanType::Exclusive >( this->a );
 }
 
+TYPED_TEST( ScanTest, exclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   this->resetWorkingArrays();  // as between the other overloads: start the next check from reset working arrays
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( ScanTest, inplaceExclusiveScan_constant_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -354,6 +618,28 @@ TYPED_TEST( ScanTest, inplaceExclusiveScan_constant_sequence )
    this->template checkResult< ScanType::Exclusive >( this->a );
 }
 
+TYPED_TEST( ScanTest, exclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
 TYPED_TEST( ScanTest, inplaceExclusiveScan_linear_sequence )
 {
    using ValueType = typename TestFixture::ValueType;
@@ -375,7 +661,7 @@ TYPED_TEST( ScanTest, inplaceExclusiveScan_linear_sequence )
 }
 
 
-TYPED_TEST( ScanTest, inplace_multiplication )
+TYPED_TEST( ScanTest, multiplication )
 {
    this->input_host.setSize( 10 );
    this->input_host.setValue( 2 );
@@ -389,6 +675,10 @@ TYPED_TEST( ScanTest, inplace_multiplication )
    }
 
    this->a = this->input_host;
+   this->b = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->a.getSize(), 0, TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    inplaceExclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
    this->template checkResult< ScanType::Exclusive >( this->a );
 
@@ -396,13 +686,16 @@ TYPED_TEST( ScanTest, inplace_multiplication )
    for( int i = 0; i < this->expected_host.getSize(); i++ )
       this->expected_host[ i ] *= 2;
 
-   this->a.reset();
    this->a = this->input_host;
+   this->b = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->a.getSize(), 0, TNL::Multiplies{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    inplaceInclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
-TYPED_TEST( ScanTest, inplace_custom_begin_end )
+TYPED_TEST( ScanTest, custom_begin_end )
 {
    using IndexType = typename TestFixture::IndexType;
 
@@ -416,6 +709,10 @@ TYPED_TEST( ScanTest, inplace_custom_begin_end )
       this->expected_host[ i ] = i - begin;
 
    this->a = this->input_host;
+   this->b = this->input_host;
+   exclusiveScan( this->a, this->b, begin, end, begin );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    inplaceExclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Exclusive >( this->a );
 
@@ -423,13 +720,16 @@ TYPED_TEST( ScanTest, inplace_custom_begin_end )
    for( int i = begin; i < end; i++ )
       this->expected_host[ i ]++;
 
-   this->a.reset();
    this->a = this->input_host;
+   this->b = this->input_host;
+   inclusiveScan( this->a, this->b, begin, end, begin );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    inplaceInclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
-TYPED_TEST( ScanTest, inplace_empty_range )
+TYPED_TEST( ScanTest, empty_range )
 {
    using IndexType = typename TestFixture::IndexType;
 
@@ -442,12 +742,19 @@ TYPED_TEST( ScanTest, inplace_empty_range )
 
    // exclusive scan test
    this->a = this->input_host;
+   this->b = this->input_host;
+   exclusiveScan( this->a, this->b, begin, end, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    inplaceExclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Exclusive >( this->a );
 
    // inclusive scan test
-   this->a.reset();
    this->a = this->input_host;
+   this->b = this->input_host;
+   inclusiveScan( this->a, this->b, begin, end, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
    inplaceInclusiveScan( this->a, begin, end );
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
-- 
GitLab


From 7a68883366b7c9b0044750a8dc108cc1070a9544 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 17 Jul 2021 18:07:49 +0200
Subject: [PATCH 24/52] Refactored CUDA parallel scan kernel

- input and output are passed by views rather than raw pointers (this
  makes it possible to scan even vector expressions)
- consequently, indexing is different (begin and end for the global
  memory accesses)
- fixed calculation of currentSize in the launcher
- changed configuration of the kernel using the blockSize and
  valuesPerThread template parameters rather than the elementsInBlock
  runtime parameter
- changed allocation of the shared memory from dynamic to static
- the second phase kernel uses shared memory to cache block results for
  each block
---
 src/TNL/Algorithms/detail/CudaScanKernel.h    | 361 ++++++++++--------
 src/TNL/Algorithms/detail/Scan.hpp            |  33 +-
 .../Algorithms/distributedScanTest.h          |  22 ++
 src/UnitTests/Algorithms/scanTest.h           |  22 ++
 4 files changed, 271 insertions(+), 167 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 3f2848d3a..b096c0109 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -22,122 +22,138 @@ namespace detail {
 
 #ifdef HAVE_CUDA
 
-template< typename Real,
-          typename Reduction,
-          typename Index >
+template< ScanType scanType,
+          int blockSize,
+          int valuesPerThread,
+          typename InputView,
+          typename OutputView,
+          typename Reduction >
 __global__ void
-cudaFirstPhaseBlockScan( const ScanType scanType,
-                         Reduction reduction,
-                         const Real zero,
-                         const Index size,
-                         const int elementsInBlock,
-                         const Real* input,
-                         Real* output,
-                         Real* blockResults )
+CudaScanKernelFirstPhase( const InputView input,
+                          OutputView output,
+                          typename InputView::IndexType begin,
+                          typename InputView::IndexType end,
+                          typename OutputView::IndexType outputBegin,
+                          Reduction reduction,
+                          typename OutputView::ValueType zero,
+                          typename OutputView::ValueType* blockResults )
 {
-   Real* sharedData = TNL::Cuda::getSharedMemory< Real >();
-   Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ];
-   Real* warpSums = &auxData[ blockDim.x ];
-
-   const Index lastElementIdx = size - blockIdx.x * elementsInBlock;
-   const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock );
+   using ValueType = typename OutputView::ValueType;
+   using IndexType = typename InputView::IndexType;
+
+   // verify the configuration
+   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" );
+   static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
+                  "blockSize is too large, it would not be possible to scan warpResults using one warp" );
+
+   // calculate indices
+   constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+   const int remainingElements = end - begin - blockIdx.x * maxElementsInBlock;
+   const int elementsInBlock = TNL::min( remainingElements, maxElementsInBlock );
+
+   // update global array offsets for the thread
+   const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
+   begin += threadOffset;
+   outputBegin += threadOffset;
+
+   // allocate shared memory
+   constexpr int shmemElements = maxElementsInBlock + maxElementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2;
+   __shared__ ValueType sharedData[ shmemElements ];  // accessed via Cuda::getInterleaving()
+   __shared__ ValueType chunkResults[ blockSize ];
+   __shared__ ValueType warpResults[ Cuda::getWarpSize() ];
 
    /***
     * Load data into the shared memory.
     */
-   const int blockOffset = blockIdx.x * elementsInBlock;
    int idx = threadIdx.x;
    if( scanType == ScanType::Exclusive )
    {
       if( idx == 0 )
          sharedData[ 0 ] = zero;
-      while( idx < elementsInBlock && blockOffset + idx < size )
+      while( idx < elementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
+         sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ begin ];
+         begin += blockDim.x;
          idx += blockDim.x;
       }
    }
    else
    {
-      while( idx < elementsInBlock && blockOffset + idx < size )
+      while( idx < elementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
+         sharedData[ Cuda::getInterleaving( idx ) ] = input[ begin ];
+         begin += blockDim.x;
          idx += blockDim.x;
       }
    }
+   __syncthreads();
 
    /***
-    * Perform the sequential prefix-sum.
+    * Perform the sequential scan of the chunk in shared memory.
     */
-   __syncthreads();
-   const int chunkSize = elementsInBlock / blockDim.x;
-   const int chunkOffset = threadIdx.x * chunkSize;
-   const int numberOfChunks = roundUpDivision( lastElementInBlock, chunkSize );
-
-   if( chunkOffset < lastElementInBlock )
-   {
-      auxData[ threadIdx.x ] =
-         sharedData[ Cuda::getInterleaving( chunkOffset ) ];
-   }
+   const int chunkOffset = threadIdx.x * valuesPerThread;
+   const int numberOfChunks = roundUpDivision( elementsInBlock, valuesPerThread );
 
    int chunkPointer = 1;
-   while( chunkPointer < chunkSize &&
-          chunkOffset + chunkPointer < lastElementInBlock )
+   while( chunkPointer < valuesPerThread && chunkOffset + chunkPointer < elementsInBlock )
    {
       sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
          reduction( sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
                     sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
-      auxData[ threadIdx.x ] =
-         sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
       chunkPointer++;
    }
 
+   // store the result of the sequential reduction of the chunk in chunkResults
+   chunkResults[ threadIdx.x ] = sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ];
+   __syncthreads();
+
    /***
-    *  Perform the parallel prefix-sum inside warps.
+    * Perform the parallel scan on chunkResults inside warps.
     */
    const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
    const int warpIdx = threadIdx.x / Cuda::getWarpSize();
    for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
       if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
-         auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] );
+         chunkResults[ threadIdx.x ] = reduction( chunkResults[ threadIdx.x ], chunkResults[ threadIdx.x - stride ] );
       __syncwarp();
    }
 
    if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-      warpSums[ warpIdx ] = auxData[ threadIdx.x ];
+      warpResults[ warpIdx ] = chunkResults[ threadIdx.x ];
    __syncthreads();
 
    /****
-    * Compute prefix-sum of warp sums using one warp
+    * Perform the scan of warpResults using one warp.
     */
    if( warpIdx == 0 )
       for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
          if( threadInWarpIdx >= stride )
-            warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] );
+            warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] );
          __syncwarp();
       }
    __syncthreads();
 
    /****
-    * Shift the warp prefix-sums.
+    * Shift chunkResults by the warpResults.
     */
    if( warpIdx > 0 )
-      auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] );
+      chunkResults[ threadIdx.x ] = reduction( chunkResults[ threadIdx.x ], warpResults[ warpIdx - 1 ] );
    __syncthreads();
 
    /***
-    *  Store the result back in global memory.
+    * Store the result back in global memory.
     */
    idx = threadIdx.x;
-   while( idx < elementsInBlock && blockOffset + idx < size )
+   while( idx < elementsInBlock )
    {
-      const int chunkIdx = idx / chunkSize;
-      Real chunkShift( zero );
+      const int chunkIdx = idx / valuesPerThread;
+      ValueType chunkShift = zero;
       if( chunkIdx > 0 )
-         chunkShift = auxData[ chunkIdx - 1 ];
+         chunkShift = chunkResults[ chunkIdx - 1 ];
+      output[ outputBegin ] =
       sharedData[ Cuda::getInterleaving( idx ) ] =
          reduction( sharedData[ Cuda::getInterleaving( idx ) ], chunkShift );
-      output[ blockOffset + idx ] = sharedData[ Cuda::getInterleaving( idx ) ];
+      outputBegin += blockDim.x;
       idx += blockDim.x;
    }
    __syncthreads();
@@ -146,137 +162,165 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
    {
       if( scanType == ScanType::Exclusive )
       {
-         blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
-                                                 sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
+         blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( elementsInBlock - 1 ) ],
+                                                 sharedData[ Cuda::getInterleaving( elementsInBlock ) ] );
       }
       else
-         blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
+         blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( elementsInBlock - 1 ) ];
    }
 }
 
-template< typename Real,
-          typename Reduction,
-          typename Index >
+template< int blockSize,
+          int valuesPerThread,
+          typename OutputView,
+          typename Reduction >
 __global__ void
-cudaSecondPhaseBlockScan( Reduction reduction,
-                          const Index size,
-                          const int elementsInBlock,
-                          const Index gridIdx,
-                          const Index maxGridSize,
-                          const Real* blockResults,
-                          Real* data,
-                          Real shift )
+CudaScanKernelSecondPhase( OutputView output,
+                           typename OutputView::IndexType outputBegin,
+                           typename OutputView::IndexType outputEnd,
+                           Reduction reduction,
+                           int gridOffset,
+                           const typename OutputView::ValueType* blockResults,
+                           typename OutputView::ValueType shift )
 {
-   shift = reduction( shift, blockResults[ gridIdx * maxGridSize + blockIdx.x ] );
-   const int readOffset = blockIdx.x * elementsInBlock;
-   int readIdx = threadIdx.x;
-   while( readIdx < elementsInBlock && readOffset + readIdx < size )
+   // load the block result into a __shared__ variable first
+   __shared__ typename OutputView::ValueType blockResult;
+   if( threadIdx.x == 0 )
+      blockResult = blockResults[ gridOffset + blockIdx.x ];
+
+   // update the output offset for the thread
+   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" );
+   constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+   const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
+   outputBegin += threadOffset;
+
+   // update the block shift
+   __syncthreads();
+   shift = reduction( shift, blockResult );
+
+   int valueIdx = 0;
+   while( valueIdx < valuesPerThread && outputBegin < outputEnd )
    {
-      data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift );
-      readIdx += blockDim.x;
+      output[ outputBegin ] = reduction( output[ outputBegin ], shift );
+      outputBegin += blockDim.x;
+      valueIdx++;
    }
 }
 
-template< ScanType scanType >
+/**
+ * \tparam blockSize  The CUDA block size to be used for kernel launch.
+ * \tparam valuesPerThread  Number of elements processed by each thread sequentially.
+ */
+template< ScanType scanType,
+          int blockSize = 256,
+          int valuesPerThread = 8 >
 struct CudaScanKernelLauncher
 {
    /****
     * \brief Performs both phases of prefix sum.
     *
-    * \param size  Number of elements to be scanned.
-    * \param deviceInput  Pointer to input data on GPU.
-    * \param deviceOutput  Pointer to output array on GPU, can be the same as input.
+    * \param input the input array to be scanned
+    * \param output the array where the result will be stored
+    * \param begin the first element in the array to be scanned
+    * \param end the last element in the array to be scanned
+    * \param outputBegin the first element in the output array to be written. There
+    *                    must be at least `end - begin` elements in the output
+    *                    array starting at the position given by `outputBegin`.
     * \param reduction  Symmetric binary function representing the reduction operation
     *                   (usually addition, i.e. an instance of \ref std::plus).
     * \param zero  Neutral element for given reduction operation, i.e. value such that
     *              `reduction(zero, x) == x` for any `x`.
-    * \param blockSize  The CUDA block size to be used for kernel launch.
     */
-   template< typename Reduction,
-             typename Real,
-             typename Index >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
    static void
-   perform( const Index size,
-            const Real* deviceInput,
-            Real* deviceOutput,
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
             Reduction&& reduction,
-            const Real zero,
-            const int blockSize = 256 )
+            typename OutputArray::ValueType zero )
    {
       const auto blockShifts = performFirstPhase(
-         size,
-         deviceInput,
-         deviceOutput,
+         input,
+         output,
+         begin,
+         end,
+         outputBegin,
          reduction,
-         zero,
-         blockSize );
+         zero );
       performSecondPhase(
-         size,
-         deviceOutput,
-         blockShifts.getData(),
+         input,
+         output,
+         blockShifts,
+         begin,
+         end,
+         outputBegin,
          reduction,
-         zero,
-         blockSize );
+         zero );
    }
 
    /****
     * \brief Performs the first phase of prefix sum.
     *
-    * \param size  Number of elements to be scanned.
-    * \param deviceInput  Pointer to input data on GPU.
-    * \param deviceOutput  Pointer to output array on GPU, can be the same as input.
+    * \param input the input array to be scanned
+    * \param output the array where the result will be stored
+    * \param begin the first element in the array to be scanned
+    * \param end the last element in the array to be scanned
+    * \param outputBegin the first element in the output array to be written. There
+    *                    must be at least `end - begin` elements in the output
+    *                    array starting at the position given by `outputBegin`.
     * \param reduction  Symmetric binary function representing the reduction operation
     *                   (usually addition, i.e. an instance of \ref std::plus).
     * \param zero  Neutral value for given reduction operation, i.e. value such that
     *              `reduction(zero, x) == x` for any `x`.
-    * \param blockSize  The CUDA block size to be used for kernel launch.
     */
-   template< typename Reduction,
-             typename Real,
-             typename Index >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
    static auto
-   performFirstPhase( const Index size,
-                      const Real* deviceInput,
-                      Real* deviceOutput,
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
                       Reduction&& reduction,
-                      const Real zero,
-                      const int blockSize = 256 )
+                      typename OutputArray::ValueType zero )
    {
+      using Index = typename InputArray::IndexType;
+
       // compute the number of grids
-      const int elementsInBlock = 8 * blockSize;
-      const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
+      constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+      const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
       const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
 
       // allocate array for the block results
-      Containers::Array< Real, Devices::Cuda > blockResults;
+      Containers::Array< typename OutputArray::ValueType, Devices::Cuda > blockResults;
       blockResults.setSize( numberOfBlocks + 1 );
       blockResults.setElement( 0, zero );
 
       // loop over all grids
       for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
-         // compute current grid size and size of data to be scanned
-         const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock;
-         Index currentSize = size - gridOffset;
-         if( currentSize / elementsInBlock > maxGridSize() )
-            currentSize = maxGridSize() * elementsInBlock;
+         // compute current grid offset and size of data to be scanned
+         const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
+         const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
 
          // setup block and grid size
          dim3 cudaBlockSize, cudaGridSize;
          cudaBlockSize.x = blockSize;
-         cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock );
+         cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
 
          // run the kernel
-         const std::size_t sharedDataSize = elementsInBlock +
-                                            elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2;
-         const std::size_t sharedMemory = ( sharedDataSize + blockSize + Cuda::getWarpSize() ) * sizeof( Real );
-         cudaFirstPhaseBlockScan<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-            ( scanType,
+         CudaScanKernelFirstPhase< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+            ( input.getConstView(),
+              output.getView(),
+              begin + gridOffset,
+              begin + gridOffset + currentSize,
+              outputBegin + gridOffset,
               reduction,
               zero,
-              currentSize,
-              elementsInBlock,
-              &deviceInput[ gridOffset ],
-              &deviceOutput[ gridOffset ],
               // blockResults are shifted by 1, because the 0-th element should stay zero
               &blockResults.getData()[ gridIdx * maxGridSize() + 1 ] );
       }
@@ -291,12 +335,13 @@ struct CudaScanKernelLauncher
          // we perform an inclusive scan, but the 0-th is zero and block results
          // were shifted by 1, so effectively we get an exclusive scan
          CudaScanKernelLauncher< ScanType::Inclusive >::perform(
+            blockResults,
+            blockResults,
+            0,
             blockResults.getSize(),
-            blockResults.getData(),
-            blockResults.getData(),
+            0,
             reduction,
-            zero,
-            blockSize );
+            zero );
       }
 
       // Store the number of CUDA grids for the purpose of unit testing, i.e.
@@ -310,55 +355,61 @@ struct CudaScanKernelLauncher
    /****
     * \brief Performs the second phase of prefix sum.
     *
-    * \param size  Number of elements to be scanned.
-    * \param deviceOutput  Pointer to output array on GPU.
+    * \param input the input array to be scanned
+    * \param output the array where the result will be stored
     * \param blockShifts  Pointer to a GPU array containing the block shifts. It is the
     *                     result of the first phase.
+    * \param begin the first element in the array to be scanned
+    * \param end the last element in the array to be scanned
+    * \param outputBegin the first element in the output array to be written. There
+    *                    must be at least `end - begin` elements in the output
+    *                    array starting at the position given by `outputBegin`.
     * \param reduction  Symmetric binary function representing the reduction operation
     *                   (usually addition, i.e. an instance of \ref std::plus).
     * \param shift  A constant shifting all elements of the array (usually `zero`, i.e.
     *               the neutral value).
-    * \param blockSize  The CUDA block size to be used for kernel launch.
     */
-   template< typename Reduction,
-             typename Real,
-             typename Index >
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
    static void
-   performSecondPhase( const Index size,
-                       Real* deviceOutput,
-                       const Real* blockShifts,
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
+                       const BlockShifts& blockShifts,
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       const Real shift,
-                       const int blockSize = 256 )
+                       typename OutputArray::ValueType zero )
    {
+      using Index = typename InputArray::IndexType;
+
       // compute the number of grids
-      const int elementsInBlock = 8 * blockSize;
-      const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
+      constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+      const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
       const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
 
       // loop over all grids
       for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
-         // compute current grid size and size of data to be scanned
-         const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock;
-         Index currentSize = size - gridOffset;
-         if( currentSize / elementsInBlock > maxGridSize() )
-            currentSize = maxGridSize() * elementsInBlock;
+         // compute current grid offset and size of data to be scanned
+         const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
+         const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
 
          // setup block and grid size
          dim3 cudaBlockSize, cudaGridSize;
          cudaBlockSize.x = blockSize;
-         cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock );
+         cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
 
          // run the kernel
-         cudaSecondPhaseBlockScan<<< cudaGridSize, cudaBlockSize >>>
-            ( reduction,
-              currentSize,
-              elementsInBlock,
-              gridIdx,
-              (Index) maxGridSize(),
-              blockShifts,
-              &deviceOutput[ gridOffset ],
-              shift );
+         CudaScanKernelSecondPhase< blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+            ( output.getView(),
+              outputBegin + gridOffset,
+              outputBegin + gridOffset + currentSize,
+              reduction,
+              gridIdx * maxGridSize(),
+              blockShifts.getData(),
+              zero );
       }
 
       // synchronize the null-stream after all grids
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
index d982f1731..cda45de0e 100644
--- a/src/TNL/Algorithms/detail/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -12,6 +12,8 @@
 
 #pragma once
 
+#include <utility>  // std::forward
+
 #include "Scan.h"
 #include "CudaScanKernel.h"
 
@@ -275,10 +277,12 @@ perform( const InputArray& input,
       return;
 
    detail::CudaScanKernelLauncher< Type >::perform(
-      end - begin,
-      &input.getData()[ begin ],
-      &output.getData()[ outputBegin ],
-      reduction,
+      input,
+      output,
+      begin,
+      end,
+      outputBegin,
+      std::forward< Reduction >( reduction ),
       zero );
 #else
    throw Exceptions::CudaSupportMissing();
@@ -307,10 +311,12 @@ performFirstPhase( const InputArray& input,
    }
 
    return detail::CudaScanKernelLauncher< Type >::performFirstPhase(
-      end - begin,
-      &input.getData()[ begin ],
-      &output.getData()[ outputBegin ],
-      reduction,
+      input,
+      output,
+      begin,
+      end,
+      outputBegin,
+      std::forward< Reduction >( reduction ),
       zero );
 #else
    throw Exceptions::CudaSupportMissing();
@@ -338,10 +344,13 @@ performSecondPhase( const InputArray& input,
       return;
 
    detail::CudaScanKernelLauncher< Type >::performSecondPhase(
-      end - begin,
-      &output.getData()[ outputBegin ],
-      blockShifts.getData(),
-      reduction,
+      input,
+      output,
+      blockShifts,
+      begin,
+      end,
+      outputBegin,
+      std::forward< Reduction >( reduction ),
       zero );
 #else
    throw Exceptions::CudaSupportMissing();
diff --git a/src/UnitTests/Algorithms/distributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
index 757c82477..0b4e73c97 100644
--- a/src/UnitTests/Algorithms/distributedScanTest.h
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -733,6 +733,28 @@ TYPED_TEST( DistributedScanTest, empty_range )
    this->template checkResult< ScanType::Inclusive >( this->a, false );
 }
 
+TYPED_TEST( DistributedScanTest, vector_expression )
+{
+   this->a.setValue( 2 );
+   this->b.setValue( 1 );
+
+   // exclusive scan test
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i;
+
+   this->c.setValue( 0 );
+   distributedExclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->c );
+
+   // inclusive scan test
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ]++;
+
+   this->c.setValue( 0 );
+   distributedInclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->c );
+}
+
 #endif  // HAVE_GTEST
 
 #include "../main_mpi.h"
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
index 2ad5453e3..d57c923fa 100644
--- a/src/UnitTests/Algorithms/scanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -759,6 +759,28 @@ TYPED_TEST( ScanTest, empty_range )
    this->template checkResult< ScanType::Inclusive >( this->a );
 }
 
+TYPED_TEST( ScanTest, vector_expression )
+{
+   this->a.setValue( 2 );
+   this->b.setValue( 1 );
+
+   // exclusive scan test
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i;
+
+   this->c.setValue( 0 );
+   exclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->c );
+
+   // inclusive scan test
+   for( int i = 0; i < this->expected_host.getSize(); i++ )
+      this->expected_host[ i ]++;
+
+   this->c.setValue( 0 );
+   inclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->c );
+}
+
 #endif // HAVE_GTEST
 
 #include "../main.h"
-- 
GitLab


From c37987b198db37c1c1b4de21aaa1d64077e4dd98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 18 Jul 2021 19:49:02 +0200
Subject: [PATCH 25/52] Refactored CUDA parallel scan kernel

The input values are first copied into shared memory, reduced
sequentially across chunks, and scanned only at the end of the kernel.
This follows the upsweep-downsweep approach by Blelloch which is more
work-efficient. Also, the distinction between exclusive and inclusive
scan appears only at the end of the kernel, which avoids the weird "+2"
size of the shared memory.

Also used Cuda::getInterleaving() for the indices when accessing the
chunkResults array, which avoids shared memory bank conflicts in the
spine-scan phase.
---
 src/TNL/Algorithms/detail/CudaScanKernel.h | 118 ++++++++++-----------
 1 file changed, 55 insertions(+), 63 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index b096c0109..58b377dd5 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -57,75 +57,63 @@ CudaScanKernelFirstPhase( const InputView input,
    outputBegin += threadOffset;
 
    // allocate shared memory
-   constexpr int shmemElements = maxElementsInBlock + maxElementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2;
+   constexpr int shmemElements = maxElementsInBlock + maxElementsInBlock / Cuda::getNumberOfSharedMemoryBanks();
    __shared__ ValueType sharedData[ shmemElements ];  // accessed via Cuda::getInterleaving()
-   __shared__ ValueType chunkResults[ blockSize ];
+   __shared__ ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ];  // accessed via Cuda::getInterleaving()
    __shared__ ValueType warpResults[ Cuda::getWarpSize() ];
 
-   /***
-    * Load data into the shared memory.
-    */
-   int idx = threadIdx.x;
-   if( scanType == ScanType::Exclusive )
+   // Load data into the shared memory.
    {
-      if( idx == 0 )
-         sharedData[ 0 ] = zero;
+      int idx = threadIdx.x;
       while( idx < elementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ begin ];
+         sharedData[ Cuda::getInterleaving( idx ) ] = input[ begin ];
          begin += blockDim.x;
          idx += blockDim.x;
       }
-   }
-   else
-   {
-      while( idx < elementsInBlock )
+      // fill the remaining (maxElementsInBlock - elementsInBlock) values with zero
+      // (this helps to avoid divergent branches in the blocks below)
+      while( idx < maxElementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx ) ] = input[ begin ];
-         begin += blockDim.x;
+         sharedData[ Cuda::getInterleaving( idx ) ] = zero;
          idx += blockDim.x;
       }
    }
    __syncthreads();
 
-   /***
-    * Perform the sequential scan of the chunk in shared memory.
-    */
+   // Perform sequential reduction of the chunk in shared memory.
    const int chunkOffset = threadIdx.x * valuesPerThread;
-   const int numberOfChunks = roundUpDivision( elementsInBlock, valuesPerThread );
-
-   int chunkPointer = 1;
-   while( chunkPointer < valuesPerThread && chunkOffset + chunkPointer < elementsInBlock )
+   const int chunkResultIdx = Cuda::getInterleaving( threadIdx.x );
    {
-      sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
-         reduction( sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
-                    sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
-      chunkPointer++;
-   }
+      ValueType chunkResult = sharedData[ Cuda::getInterleaving( chunkOffset ) ];
+      #pragma unroll
+      for( int i = 1; i < valuesPerThread; i++ )
+         chunkResult = reduction( chunkResult, sharedData[ Cuda::getInterleaving( chunkOffset + i ) ] );
 
-   // store the result of the sequential reduction of the chunk in chunkResults
-   chunkResults[ threadIdx.x ] = sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ];
+      // store the result of the sequential reduction of the chunk in chunkResults
+      chunkResults[ chunkResultIdx ] = chunkResult;
+   }
    __syncthreads();
 
-   /***
-    * Perform the parallel scan on chunkResults inside warps.
-    */
+   // Perform the parallel scan on chunkResults inside warps.
    const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
    const int warpIdx = threadIdx.x / Cuda::getWarpSize();
+   #pragma unroll
    for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-      if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
-         chunkResults[ threadIdx.x ] = reduction( chunkResults[ threadIdx.x ], chunkResults[ threadIdx.x - stride ] );
+      if( threadInWarpIdx >= stride ) {
+         chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], chunkResults[ Cuda::getInterleaving( threadIdx.x - stride ) ] );
+      }
       __syncwarp();
    }
 
+   // The last thread in warp stores the intermediate result in warpResults.
    if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-      warpResults[ warpIdx ] = chunkResults[ threadIdx.x ];
+      warpResults[ warpIdx ] = chunkResults[ chunkResultIdx ];
    __syncthreads();
 
-   /****
-    * Perform the scan of warpResults using one warp.
-    */
+   // Perform the scan of warpResults using one warp.
    if( warpIdx == 0 )
+      #pragma unroll
       for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
          if( threadInWarpIdx >= stride )
             warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] );
@@ -133,40 +121,44 @@ CudaScanKernelFirstPhase( const InputView input,
       }
    __syncthreads();
 
-   /****
-    * Shift chunkResults by the warpResults.
-    */
+   // Shift chunkResults by the warpResults.
    if( warpIdx > 0 )
-      chunkResults[ threadIdx.x ] = reduction( chunkResults[ threadIdx.x ], warpResults[ warpIdx - 1 ] );
+      chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], warpResults[ warpIdx - 1 ] );
    __syncthreads();
 
-   /***
-    * Store the result back in global memory.
-    */
-   idx = threadIdx.x;
-   while( idx < elementsInBlock )
+   // Downsweep step: scan the chunks and use the chunk result as the initial value.
    {
-      const int chunkIdx = idx / valuesPerThread;
-      ValueType chunkShift = zero;
-      if( chunkIdx > 0 )
-         chunkShift = chunkResults[ chunkIdx - 1 ];
-      output[ outputBegin ] =
-      sharedData[ Cuda::getInterleaving( idx ) ] =
-         reduction( sharedData[ Cuda::getInterleaving( idx ) ], chunkShift );
-      outputBegin += blockDim.x;
-      idx += blockDim.x;
+      ValueType value = zero;
+      if( threadIdx.x > 0 )
+         value = chunkResults[ Cuda::getInterleaving( threadIdx.x - 1 ) ];
+
+      #pragma unroll
+      for( int i = 0; i < valuesPerThread; i++ )
+      {
+         const int sharedIdx = Cuda::getInterleaving( chunkOffset + i );
+         const ValueType inputValue = sharedData[ sharedIdx ];
+         if( scanType == ScanType::Exclusive )
+            sharedData[ sharedIdx ] = value;
+         value = reduction( value, inputValue );
+         if( scanType == ScanType::Inclusive )
+            sharedData[ sharedIdx ] = value;
+      }
+
+      // The last thread of the block stores the block result in the global memory.
+      if( blockResults && threadIdx.x == blockDim.x - 1 )
+         blockResults[ blockIdx.x ] = value;
    }
    __syncthreads();
 
-   if( threadIdx.x == 0 )
+   // Store the result back in the global memory.
    {
-      if( scanType == ScanType::Exclusive )
+      int idx = threadIdx.x;
+      while( idx < elementsInBlock )
       {
-         blockResults[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( elementsInBlock - 1 ) ],
-                                                 sharedData[ Cuda::getInterleaving( elementsInBlock ) ] );
+         output[ outputBegin ] = sharedData[ Cuda::getInterleaving( idx ) ];
+         outputBegin += blockDim.x;
+         idx += blockDim.x;
       }
-      else
-         blockResults[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( elementsInBlock - 1 ) ];
    }
 }
 
-- 
GitLab


From 429bd5114f2d6a8bcbc3486a4ed5d15fc93cd580 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Mon, 19 Jul 2021 08:37:18 +0200
Subject: [PATCH 26/52] Refactored CUDA parallel scan kernel

Using an odd number of valuesPerThread avoids shared memory bank
conflicts even without a special interleaving. We also save some shared
memory this way.

Small inputs can be scanned with just one CUDA block, which avoids the
scan of block results and second-phase kernel. Hence, large arrays can
be scanned with just 3 kernel launches instead of 4.
---
 src/TNL/Algorithms/detail/CudaScanKernel.h | 190 ++++++++++++++-------
 1 file changed, 128 insertions(+), 62 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 58b377dd5..8184d7625 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -45,6 +45,9 @@ CudaScanKernelFirstPhase( const InputView input,
    TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" );
    static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
                   "blockSize is too large, it would not be possible to scan warpResults using one warp" );
+   static_assert( valuesPerThread % 2,
+                  "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts "
+                  "when threads access their chunks in sharedData sequentially" );
 
    // calculate indices
    constexpr int maxElementsInBlock = blockSize * valuesPerThread;
@@ -57,8 +60,7 @@ CudaScanKernelFirstPhase( const InputView input,
    outputBegin += threadOffset;
 
    // allocate shared memory
-   constexpr int shmemElements = maxElementsInBlock + maxElementsInBlock / Cuda::getNumberOfSharedMemoryBanks();
-   __shared__ ValueType sharedData[ shmemElements ];  // accessed via Cuda::getInterleaving()
+   __shared__ ValueType sharedData[ maxElementsInBlock ];
    __shared__ ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ];  // accessed via Cuda::getInterleaving()
    __shared__ ValueType warpResults[ Cuda::getWarpSize() ];
 
@@ -67,7 +69,7 @@ CudaScanKernelFirstPhase( const InputView input,
       int idx = threadIdx.x;
       while( idx < elementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx ) ] = input[ begin ];
+         sharedData[ idx ] = input[ begin ];
          begin += blockDim.x;
          idx += blockDim.x;
       }
@@ -75,7 +77,7 @@ CudaScanKernelFirstPhase( const InputView input,
       // (this helps to avoid divergent branches in the blocks below)
       while( idx < maxElementsInBlock )
       {
-         sharedData[ Cuda::getInterleaving( idx ) ] = zero;
+         sharedData[ idx ] = zero;
          idx += blockDim.x;
       }
    }
@@ -85,10 +87,10 @@ CudaScanKernelFirstPhase( const InputView input,
    const int chunkOffset = threadIdx.x * valuesPerThread;
    const int chunkResultIdx = Cuda::getInterleaving( threadIdx.x );
    {
-      ValueType chunkResult = sharedData[ Cuda::getInterleaving( chunkOffset ) ];
+      ValueType chunkResult = sharedData[ chunkOffset ];
       #pragma unroll
       for( int i = 1; i < valuesPerThread; i++ )
-         chunkResult = reduction( chunkResult, sharedData[ Cuda::getInterleaving( chunkOffset + i ) ] );
+         chunkResult = reduction( chunkResult, sharedData[ chunkOffset + i ] );
 
       // store the result of the sequential reduction of the chunk in chunkResults
       chunkResults[ chunkResultIdx ] = chunkResult;
@@ -135,13 +137,12 @@ CudaScanKernelFirstPhase( const InputView input,
       #pragma unroll
       for( int i = 0; i < valuesPerThread; i++ )
       {
-         const int sharedIdx = Cuda::getInterleaving( chunkOffset + i );
-         const ValueType inputValue = sharedData[ sharedIdx ];
+         const ValueType inputValue = sharedData[ chunkOffset + i ];
          if( scanType == ScanType::Exclusive )
-            sharedData[ sharedIdx ] = value;
+            sharedData[ chunkOffset + i ] = value;
          value = reduction( value, inputValue );
          if( scanType == ScanType::Inclusive )
-            sharedData[ sharedIdx ] = value;
+            sharedData[ chunkOffset + i ] = value;
       }
 
       // The last thread of the block stores the block result in the global memory.
@@ -155,7 +156,7 @@ CudaScanKernelFirstPhase( const InputView input,
       int idx = threadIdx.x;
       while( idx < elementsInBlock )
       {
-         output[ outputBegin ] = sharedData[ Cuda::getInterleaving( idx ) ];
+         output[ outputBegin ] = sharedData[ idx ];
          outputBegin += blockDim.x;
          idx += blockDim.x;
       }
@@ -181,7 +182,7 @@ CudaScanKernelSecondPhase( OutputView output,
       blockResult = blockResults[ gridOffset + blockIdx.x ];
 
    // update the output offset for the thread
-   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" );
+   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelSecondPhase" );
    constexpr int maxElementsInBlock = blockSize * valuesPerThread;
    const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
    outputBegin += threadOffset;
@@ -205,7 +206,8 @@ CudaScanKernelSecondPhase( OutputView output,
  */
 template< ScanType scanType,
           int blockSize = 256,
-          int valuesPerThread = 8 >
+          // valuesPerThread should be odd to avoid shared memory bank conflicts
+          int valuesPerThread = 7 >
 struct CudaScanKernelLauncher
 {
    /****
@@ -243,6 +245,11 @@ struct CudaScanKernelLauncher
          outputBegin,
          reduction,
          zero );
+
+      // if the first-phase kernel was launched with just one block, skip the second phase
+      if( blockShifts.getSize() <= 2 )
+         return;
+
       performSecondPhase(
          input,
          output,
@@ -283,50 +290,109 @@ struct CudaScanKernelLauncher
    {
       using Index = typename InputArray::IndexType;
 
-      // compute the number of grids
-      constexpr int maxElementsInBlock = blockSize * valuesPerThread;
-      const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
-      const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
-
-      // allocate array for the block results
-      Containers::Array< typename OutputArray::ValueType, Devices::Cuda > blockResults;
-      blockResults.setSize( numberOfBlocks + 1 );
-      blockResults.setElement( 0, zero );
-
-      // loop over all grids
-      for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
-         // compute current grid offset and size of data to be scanned
-         const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
-         const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
-
-         // setup block and grid size
-         dim3 cudaBlockSize, cudaGridSize;
-         cudaBlockSize.x = blockSize;
-         cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
-
-         // run the kernel
-         CudaScanKernelFirstPhase< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
-            ( input.getConstView(),
-              output.getView(),
-              begin + gridOffset,
-              begin + gridOffset + currentSize,
-              outputBegin + gridOffset,
-              reduction,
-              zero,
-              // blockResults are shifted by 1, because the 0-th element should stay zero
-              &blockResults.getData()[ gridIdx * maxGridSize() + 1 ] );
+      if( end - begin <= blockSize * valuesPerThread ) {
+         // allocate array for the block results
+         Containers::Array< typename OutputArray::ValueType, Devices::Cuda > blockResults;
+         blockResults.setSize( 2 );
+         blockResults.setElement( 0, zero );
+
+         // run the kernel with just 1 block
+         if( end - begin <= blockSize )
+            CudaScanKernelFirstPhase< scanType, blockSize, 1 ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 zero,
+                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 &blockResults.getData()[ 1 ] );
+         else if( end - begin <= blockSize * 3 )
+            CudaScanKernelFirstPhase< scanType, blockSize, 3 ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 zero,
+                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 &blockResults.getData()[ 1 ] );
+         else if( end - begin <= blockSize * 5 )
+            CudaScanKernelFirstPhase< scanType, blockSize, 5 ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 zero,
+                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 &blockResults.getData()[ 1 ] );
+         else
+            CudaScanKernelFirstPhase< scanType, blockSize, valuesPerThread ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 zero,
+                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 &blockResults.getData()[ 1 ] );
+
+         // synchronize the null-stream
+         cudaStreamSynchronize(0);
+         TNL_CHECK_CUDA_DEVICE;
+
+         // Store the number of CUDA grids for the purpose of unit testing, i.e.
+         // to check if we test the algorithm with more than one CUDA grid.
+         gridsCount() = 1;
+
+         // blockResults now contains shift values for each block - to be used in the second phase
+         return blockResults;
       }
-
-      // synchronize the null-stream after all grids
-      cudaStreamSynchronize(0);
-      TNL_CHECK_CUDA_DEVICE;
-
-      // blockResults now contains scan results for each block. The first phase
-      // ends by computing an exclusive scan of this array.
-      if( numberOfBlocks > 1 ) {
-         // we perform an inclusive scan, but the 0-th is zero and block results
-         // were shifted by 1, so effectively we get an exclusive scan
-         CudaScanKernelLauncher< ScanType::Inclusive >::perform(
+      else {
+         // compute the number of grids
+         constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+         const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
+         const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+
+         // allocate array for the block results
+         Containers::Array< typename OutputArray::ValueType, Devices::Cuda > blockResults;
+         blockResults.setSize( numberOfBlocks + 1 );
+
+         // loop over all grids
+         for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
+            // compute current grid offset and size of data to be scanned
+            const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
+            const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
+
+            // setup block and grid size
+            dim3 cudaBlockSize, cudaGridSize;
+            cudaBlockSize.x = blockSize;
+            cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
+
+            // run the kernel
+            CudaScanKernelFirstPhase< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin + gridOffset,
+                 begin + gridOffset + currentSize,
+                 outputBegin + gridOffset,
+                 reduction,
+                 zero,
+                 &blockResults.getData()[ gridIdx * maxGridSize() ] );
+         }
+
+         // synchronize the null-stream after all grids
+         cudaStreamSynchronize(0);
+         TNL_CHECK_CUDA_DEVICE;
+
+         // blockResults now contains scan results for each block. The first phase
+         // ends by computing an exclusive scan of this array.
+         CudaScanKernelLauncher< ScanType::Exclusive >::perform(
             blockResults,
             blockResults,
             0,
@@ -334,14 +400,14 @@ struct CudaScanKernelLauncher
             0,
             reduction,
             zero );
-      }
 
-      // Store the number of CUDA grids for the purpose of unit testing, i.e.
-      // to check if we test the algorithm with more than one CUDA grid.
-      gridsCount() = numberOfGrids;
+         // Store the number of CUDA grids for the purpose of unit testing, i.e.
+         // to check if we test the algorithm with more than one CUDA grid.
+         gridsCount() = numberOfGrids;
 
-      // blockResults now contains shift values for each block - to be used in the second phase
-      return blockResults;
+         // blockResults now contains shift values for each block - to be used in the second phase
+         return blockResults;
+      }
    }
 
    /****
-- 
GitLab


From addb7566cbe458a93dd9147e1eb32c993974f1af Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Mon, 19 Jul 2021 19:24:54 +0200
Subject: [PATCH 27/52] Optimized upper bound for the scan of warpResults in
 the CUDA parallel scan

---
 src/TNL/Algorithms/detail/CudaScanKernel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 8184d7625..a7bd55bf0 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -116,7 +116,7 @@ CudaScanKernelFirstPhase( const InputView input,
    // Perform the scan of warpResults using one warp.
    if( warpIdx == 0 )
       #pragma unroll
-      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
+      for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
          if( threadInWarpIdx >= stride )
             warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] );
          __syncwarp();
-- 
GitLab


From 3bd8fff52dfb529d74f2f3e0d7f67cabd712a7ff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 20 Jul 2021 09:47:31 +0200
Subject: [PATCH 28/52] Refactored CudaReductionKernel - split the parallel
 reduction into CudaBlockReduce

---
 .../Algorithms/detail/CudaReductionKernel.h   | 475 +++++++++++-------
 1 file changed, 288 insertions(+), 187 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
index 51f38f18b..2c5c0ddf1 100644
--- a/src/TNL/Algorithms/detail/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -23,23 +23,7 @@ namespace TNL {
 namespace Algorithms {
 namespace detail {
 
-/****
- * The performance of this kernel is very sensitive to register usage.
- * Compile with --ptxas-options=-v and configure these constants for given
- * architecture so that there are no local memory spills.
- */
-static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
-static constexpr int Reduction_registersPerThread = 32;   // empirically determined optimal value
-
 #ifdef HAVE_CUDA
-// __CUDA_ARCH__ is defined only in device code!
-#if (__CUDA_ARCH__ == 750 )
-   // Turing has a limit of 1024 threads per multiprocessor
-   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
-#else
-   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
-#endif
-
 /*
  * nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the
  * std::plus<void>::operator() function to be constexpr and hence __host__ __device__
@@ -64,6 +48,215 @@ auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg
 #endif
 }
 
+/* Template for cooperative reduction across the CUDA block of threads.
+ * It is a *cooperative* operation - every thread of the block must call it
+ * (it contains `__syncthreads()` barriers), otherwise it will deadlock!
+ *
+ * The default implementation is generic: the reduction is done in shared memory.
+ * Specializations can be made based on `Reduction` and `ValueType`, e.g. using
+ * the `__shfl_sync` intrinsics for supported value types.
+ * NOTE(review): the halving steps look like they assume a power-of-two `blockSize` (at most 1024) - confirm.
+ */
+template< int blockSize,
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockReduce
+{
+   // Scratch space used by reduce(); the caller must declare it as a __shared__ variable.
+   struct Storage
+   {
+      // When there is only one warp per block (blockSize <= 32), twice as many elements
+      // are allocated so that the unguarded reads at [ tid + offset ] below stay in bounds.
+      ValueType data[ (blockSize <= 32) ? 2 * blockSize : blockSize ];
+   };
+
+   /* Cooperative reduction across the CUDA block - each thread will get the
+    * result of the reduction.
+    *
+    * \param reduction   The binary reduction functor.
+    * \param threadValue Value of the calling thread to be reduced.
+    * \param tid         Index of the calling thread, used as its slot in `storage.data` (usually `threadIdx.x`).
+    * \param storage     Auxiliary storage (must be allocated as a __shared__ variable).
+    * \return The content of `storage.data[ 0 ]` after the final barrier.
+    *         NOTE(review): no barrier follows that read, so reusing `storage` right away looks racy - confirm.
+    */
+   __device__ static
+   ValueType
+   reduce( const Reduction& reduction,
+           ValueType threadValue,
+           int tid,
+           Storage& storage )
+   {
+      storage.data[ tid ] = threadValue;
+      __syncthreads();  // every slot must be written before any thread reads a neighbour's slot
+
+      if( blockSize >= 1024 ) {  // blockSize is a template parameter, so these tests are compile-time constants
+         if( tid < 512 )  // the lower half folds in the upper half
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 512 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 512 ) {
+         if( tid < 256 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 256 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 256 ) {
+         if( tid < 128 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 128 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 128 ) {
+         if( tid <  64 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 64 ] );
+         __syncthreads();
+      }
+
+      // Only threads with tid < 32 take part from here on, so __syncwarp() is used instead of __syncthreads().
+      if( tid < 32 ) {
+         if( blockSize >= 64 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 32 ] );
+         __syncwarp();
+         // Note that here we do not have to check if tid < 16 etc.: Storage has 2 * blockSize
+         // elements when blockSize <= 32 and blockSize elements otherwise, which (for a
+         // power-of-two blockSize) keeps the reads at [ tid + 16 ] ... [ tid + 1 ] in bounds.
+         // The results for the upper lanes will be undefined, but unused anyway.
+         if( blockSize >= 32 )  // NOTE(review): from here on, upper lanes overwrite slots that lower lanes read in the same statement, with no __syncwarp() between the read and the write - confirm this is safe
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 16 ] );
+         __syncwarp();
+         if( blockSize >= 16 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 8 ] );
+         __syncwarp();
+         if( blockSize >=  8 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 4 ] );
+         __syncwarp();
+         if( blockSize >=  4 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 2 ] );
+         __syncwarp();
+         if( blockSize >=  2 )
+            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 1 ] );
+      }
+
+      __syncthreads();  // the final value sits in slot 0; wait for it before every thread reads it
+      return storage.data[ 0 ];
+   }
+};
+
+/* Template for cooperative reduction with argument across the CUDA block of
+ * threads. It is a *cooperative* operation - every thread of the block must call
+ * it (it contains `__syncthreads()` barriers), otherwise it will deadlock!
+ *
+ * The default implementation is generic: the reduction is done in shared memory.
+ * Specializations can be made based on `Reduction` and `ValueType`, e.g. using
+ * the `__shfl_sync` intrinsics for supported value types.
+ * NOTE(review): the halving steps look like they assume a power-of-two `blockSize` (at most 1024) - confirm.
+ */
+template< int blockSize,
+          typename Reduction,
+          typename ValueType,
+          typename IndexType >
+struct CudaBlockReduceWithArgument
+{
+   // Scratch space used by reduceWithArgument(); the caller must declare it as a __shared__ variable.
+   struct Storage
+   {
+      // When there is only one warp per block (blockSize <= 32), twice as many elements
+      // are allocated so that the unguarded reads at [ tid + offset ] below stay in bounds.
+      ValueType data[ (blockSize <= 32) ? 2 * blockSize : blockSize ];
+      IndexType idx [ (blockSize <= 32) ? 2 * blockSize : blockSize ];
+   };
+
+   /* Cooperative reduction with argument across the CUDA block - each thread
+    * will get the pair of the result of the reduction and the index.
+    *
+    * \param reduction   The reduction functor, called here with four arguments:
+    *                    `reduction( value, otherValue, index, otherIndex )`. Its return value is not
+    *                    used, so it presumably updates `value` and `index` in place - TODO confirm.
+    * \param threadValue Value of the calling thread to be reduced.
+    * \param threadIndex Index value of the calling thread to be reduced.
+    * \param tid         Index of the calling thread, used as its slot in `storage` (usually `threadIdx.x`).
+    * \param storage     Auxiliary storage (must be allocated as a __shared__ variable).
+    */
+   __device__ static
+   std::pair< ValueType, IndexType >
+   reduceWithArgument( const Reduction& reduction,
+                       ValueType threadValue,
+                       IndexType threadIndex,
+                       int tid,
+                       Storage& storage )
+   {
+      storage.data[ tid ] = threadValue;
+      storage.idx[ tid ] = threadIndex;
+      __syncthreads();  // every slot must be written before any thread reads a neighbour's slot
+
+      if( blockSize >= 1024 ) {  // blockSize is a template parameter, so these tests are compile-time constants
+         if( tid < 512 )  // the lower half folds in the upper half
+            reduction( storage.data[ tid ], storage.data[ tid + 512 ], storage.idx[ tid ], storage.idx[ tid + 512 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 512 ) {
+         if( tid < 256 )
+            reduction( storage.data[ tid ], storage.data[ tid + 256 ], storage.idx[ tid ], storage.idx[ tid + 256 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 256 ) {
+         if( tid < 128 )
+            reduction( storage.data[ tid ], storage.data[ tid + 128 ], storage.idx[ tid ], storage.idx[ tid + 128 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 128 ) {
+         if( tid <  64 )
+            reduction( storage.data[ tid ], storage.data[ tid + 64 ], storage.idx[ tid ], storage.idx[ tid + 64 ] );
+         __syncthreads();
+      }
+
+      // Only threads with tid < 32 take part from here on, so __syncwarp() is used instead of __syncthreads().
+      if( tid < 32 ) {
+         if( blockSize >= 64 )
+            reduction( storage.data[ tid ], storage.data[ tid + 32 ], storage.idx[ tid ], storage.idx[ tid + 32 ] );
+         __syncwarp();
+         // Note that here we do not have to check if tid < 16 etc.: Storage has 2 * blockSize
+         // elements when blockSize <= 32 and blockSize elements otherwise, which (for a
+         // power-of-two blockSize) keeps the reads at [ tid + 16 ] ... [ tid + 1 ] in bounds.
+         // The results for the upper lanes will be undefined, but unused anyway.
+         if( blockSize >= 32 )  // NOTE(review): from here on, upper lanes may overwrite slots that lower lanes read in the same call, with no __syncwarp() between the read and the write - confirm this is safe
+            reduction( storage.data[ tid ], storage.data[ tid + 16 ], storage.idx[ tid ], storage.idx[ tid + 16 ] );
+         __syncwarp();
+         if( blockSize >= 16 )
+            reduction( storage.data[ tid ], storage.data[ tid + 8 ], storage.idx[ tid ], storage.idx[ tid + 8 ] );
+         __syncwarp();
+         if( blockSize >=  8 )
+            reduction( storage.data[ tid ], storage.data[ tid + 4 ], storage.idx[ tid ], storage.idx[ tid + 4 ] );
+         __syncwarp();
+         if( blockSize >=  4 )
+            reduction( storage.data[ tid ], storage.data[ tid + 2 ], storage.idx[ tid ], storage.idx[ tid + 2 ] );
+         __syncwarp();
+         if( blockSize >=  2 )
+            reduction( storage.data[ tid ], storage.data[ tid + 1 ], storage.idx[ tid ], storage.idx[ tid + 1 ] );
+      }
+
+      __syncthreads();  // the final pair sits in slot 0; wait for it before every thread reads it
+      return std::make_pair( storage.data[ 0 ], storage.idx[ 0 ] );  // NOTE(review): no barrier after this read - reusing `storage` right away looks racy, confirm
+   }
+};
+#endif
+
+/****
+ * The performance of this kernel is very sensitive to register usage.
+ * Compile with --ptxas-options=-v and configure these constants for given
+ * architecture so that there are no local memory spills.
+ */
+static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
+static constexpr int Reduction_registersPerThread = 32;   // empirically determined optimal value
+
+#ifdef HAVE_CUDA
+// __CUDA_ARCH__ is defined only in device code!
+#if (__CUDA_ARCH__ == 750 )
+   // Turing has a limit of 1024 threads per multiprocessor
+   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
+#else
+   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
+#endif
+
 template< int blockSize,
           typename Result,
           typename DataFetcher,
@@ -71,95 +264,49 @@ template< int blockSize,
           typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionKernel( const Result zero,
+CudaReductionKernel( Result initialValue,
                      DataFetcher dataFetcher,
                      const Reduction reduction,
-                     const Index begin,
-                     const Index end,
+                     Index begin,
+                     Index end,
                      Result* output )
 {
    TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
-   // when there is only one warp per blockSize.x, we need to allocate two warps
-   // worth of shared memory so that we don't index shared memory out of bounds
-   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
-   __shared__ Result sdata[shmemElements];
-
-   // Get the thread id (tid), global thread id (gid) and gridSize.
-   const Index tid = threadIdx.x;
-         Index gid = begin + blockIdx.x * blockDim.x + threadIdx.x;
-   const Index gridSize = blockDim.x * gridDim.x;
 
-   sdata[ tid ] = zero;
+   // allocate shared memory
+   using BlockReduce = CudaBlockReduce< blockSize, Reduction, Result >;
+   __shared__ typename BlockReduce::Storage storage;
+
+   // Calculate the grid size (stride of the sequential reduction loop).
+   const Index gridSize = blockDim.x * gridDim.x;
+   // Shift the input lower bound by the thread index in the grid.
+   begin += blockIdx.x * blockDim.x + threadIdx.x;
 
    // Start with the sequential reduction and push the result into the shared memory.
-   while( gid + 4 * gridSize < end ) {
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
-      gid += 4 * gridSize;
+   while( begin + 4 * gridSize < end ) {
+      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
+      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + gridSize ) );
+      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + 2 * gridSize ) );
+      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + 3 * gridSize ) );
+      begin += 4 * gridSize;
    }
-   while( gid + 2 * gridSize < end ) {
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
-      gid += 2 * gridSize;
+   while( begin + 2 * gridSize < end ) {
+      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
+      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + gridSize ) );
+      begin += 2 * gridSize;
    }
-   while( gid < end ) {
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
-      gid += gridSize;
+   while( begin < end ) {
+      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
+      begin += gridSize;
    }
    __syncthreads();
 
    // Perform the parallel reduction.
-   if( blockSize >= 1024 ) {
-      if( tid < 512 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 512 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 512 ) {
-      if( tid < 256 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 256 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 256 ) {
-      if( tid < 128 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 128 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 128 ) {
-      if( tid <  64 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 64 ] );
-      __syncthreads();
-   }
-
-   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
-   if( tid < 32 ) {
-      if( blockSize >= 64 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 32 ] );
-      __syncwarp();
-      // Note that here we do not have to check if tid < 16 etc, because we have
-      // 2 * blockSize.x elements of shared memory per block, so we do not
-      // access out of bounds. The results for the upper half will be undefined,
-      // but unused anyway.
-      if( blockSize >= 32 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 16 ] );
-      __syncwarp();
-      if( blockSize >= 16 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 8 ] );
-      __syncwarp();
-      if( blockSize >=  8 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 4 ] );
-      __syncwarp();
-      if( blockSize >=  4 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 2 ] );
-      __syncwarp();
-      if( blockSize >=  2 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 1 ] );
-   }
+   initialValue = BlockReduce::reduce( reduction, initialValue, threadIdx.x, storage );
 
    // Store the result back in the global memory.
-   if( tid == 0 )
-      output[ blockIdx.x ] = sdata[ 0 ];
+   if( threadIdx.x == 0 )
+      output[ blockIdx.x ] = initialValue;
 }
 
 template< int blockSize,
@@ -169,131 +316,85 @@ template< int blockSize,
           typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionWithArgumentKernel( const Result zero,
+CudaReductionWithArgumentKernel( Result initialValue,
                                  DataFetcher dataFetcher,
                                  const Reduction reduction,
-                                 const Index begin,
-                                 const Index end,
+                                 Index begin,
+                                 Index end,
                                  Result* output,
                                  Index* idxOutput,
                                  const Index* idxInput = nullptr )
 {
    TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
-   // when there is only one warp per blockSize.x, we need to allocate two warps
-   // worth of shared memory so that we don't index shared memory out of bounds
-   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
-   __shared__ Result sdata[shmemElements];
-   __shared__ Index sidx[shmemElements];
-
-   // Get the thread id (tid), global thread id (gid) and gridSize.
-   const Index tid = threadIdx.x;
-         Index gid = begin + blockIdx.x * blockDim.x + threadIdx.x;
+
+   // allocate shared memory
+   using BlockReduce = CudaBlockReduceWithArgument< blockSize, Reduction, Result, Index >;
+   __shared__ typename BlockReduce::Storage storage;
+
+   // Calculate the grid size (stride of the sequential reduction loop).
    const Index gridSize = blockDim.x * gridDim.x;
+   // Shift the input lower bound by the thread index in the grid.
+   begin += blockIdx.x * blockDim.x + threadIdx.x;
+
+   // TODO: initialIndex should be passed as an argument to the kernel
+   Index initialIndex;
 
    // Start with the sequential reduction and push the result into the shared memory.
    if( idxInput ) {
-      if( gid < end ) {
-         sdata[ tid ] = dataFetcher( gid );
-         sidx[ tid ] = idxInput[ gid ];
-         gid += gridSize;
-      } else {
-         sdata[ tid ] = zero;
+      if( begin < end ) {
+         initialValue = dataFetcher( begin );
+         initialIndex = idxInput[ begin ];
+         begin += gridSize;
       }
-      while( gid + 4 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
-         reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], idxInput[ gid + 2 * gridSize ] );
-         reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], idxInput[ gid + 3 * gridSize ] );
-         gid += 4 * gridSize;
+      while( begin + 4 * gridSize < end ) {
+         reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
+         reduction( initialValue, dataFetcher( begin + 2 * gridSize ), initialIndex, idxInput[ begin + 2 * gridSize ] );
+         reduction( initialValue, dataFetcher( begin + 3 * gridSize ), initialIndex, idxInput[ begin + 3 * gridSize ] );
+         begin += 4 * gridSize;
       }
-      while( gid + 2 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
-         gid += 2 * gridSize;
+      while( begin + 2 * gridSize < end ) {
+         reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
+         begin += 2 * gridSize;
       }
-      while( gid < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
-         gid += gridSize;
+      while( begin < end ) {
+         reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         begin += gridSize;
       }
    }
    else {
-      if( gid < end ) {
-         sdata[ tid ] = dataFetcher( gid );
-         sidx[ tid ] = gid;
-         gid += gridSize;
-      } else {
-         sdata[ tid ] = zero;
+      if( begin < end ) {
+         initialValue = dataFetcher( begin );
+         initialIndex = begin;
+         begin += gridSize;
       }
-      while( gid + 4 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
-         reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], gid + 2 * gridSize );
-         reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], gid + 3 * gridSize );
-         gid += 4 * gridSize;
+      while( begin + 4 * gridSize < end ) {
+         reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
+         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
+         reduction( initialValue, dataFetcher( begin + 2 * gridSize ), initialIndex, begin + 2 * gridSize );
+         reduction( initialValue, dataFetcher( begin + 3 * gridSize ), initialIndex, begin + 3 * gridSize );
+         begin += 4 * gridSize;
       }
-      while( gid + 2 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
-         gid += 2 * gridSize;
+      while( begin + 2 * gridSize < end ) {
+         reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
+         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
+         begin += 2 * gridSize;
       }
-      while( gid < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
-         gid += gridSize;
+      while( begin < end ) {
+         reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
+         begin += gridSize;
       }
    }
    __syncthreads();
 
    // Perform the parallel reduction.
-   if( blockSize >= 1024 ) {
-      if( tid < 512 )
-         reduction( sdata[ tid ], sdata[ tid + 512 ], sidx[ tid ], sidx[ tid + 512 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 512 ) {
-      if( tid < 256 )
-         reduction( sdata[ tid ], sdata[ tid + 256 ], sidx[ tid ], sidx[ tid + 256 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 256 ) {
-      if( tid < 128 )
-         reduction( sdata[ tid ], sdata[ tid + 128 ], sidx[ tid ], sidx[ tid + 128 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 128 ) {
-      if( tid <  64 )
-         reduction( sdata[ tid ], sdata[ tid + 64 ], sidx[ tid ], sidx[ tid + 64 ] );
-      __syncthreads();
-   }
-
-   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
-   if( tid < 32 ) {
-      if( blockSize >= 64 )
-         reduction( sdata[ tid ], sdata[ tid + 32 ], sidx[ tid ], sidx[ tid + 32 ] );
-      __syncwarp();
-      // Note that here we do not have to check if tid < 16 etc, because we have
-      // 2 * blockSize.x elements of shared memory per block, so we do not
-      // access out of bounds. The results for the upper half will be undefined,
-      // but unused anyway.
-      if( blockSize >= 32 )
-         reduction( sdata[ tid ], sdata[ tid + 16 ], sidx[ tid ], sidx[ tid + 16 ] );
-      __syncwarp();
-      if( blockSize >= 16 )
-         reduction( sdata[ tid ], sdata[ tid + 8 ], sidx[ tid ], sidx[ tid + 8 ] );
-      __syncwarp();
-      if( blockSize >=  8 )
-         reduction( sdata[ tid ], sdata[ tid + 4 ], sidx[ tid ], sidx[ tid + 4 ] );
-      __syncwarp();
-      if( blockSize >=  4 )
-         reduction( sdata[ tid ], sdata[ tid + 2 ], sidx[ tid ], sidx[ tid + 2 ] );
-      __syncwarp();
-      if( blockSize >=  2 )
-         reduction( sdata[ tid ], sdata[ tid + 1 ], sidx[ tid ], sidx[ tid + 1 ] );
-   }
+   const std::pair< Result, Index > result = BlockReduce::reduceWithArgument( reduction, initialValue, initialIndex, threadIdx.x, storage );
 
    // Store the result back in the global memory.
-   if( tid == 0 ) {
-      output[ blockIdx.x ] = sdata[ 0 ];
-      idxOutput[ blockIdx.x ] = sidx[ 0 ];
+   if( threadIdx.x == 0 ) {
+      output[ blockIdx.x ] = result.first;
+      idxOutput[ blockIdx.x ] = result.second;
    }
 }
 #endif
-- 
GitLab


From 2f61104b58e1461ebf0863801a01c8ef0b148fc2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Tue, 20 Jul 2021 11:45:46 +0200
Subject: [PATCH 29/52] Refactored CudaScanKernelFirstPhase - split out
 CudaBlockScan and CudaTileScan

---
 src/TNL/Algorithms/detail/CudaScanKernel.h | 333 ++++++++++++++-------
 1 file changed, 223 insertions(+), 110 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index a7bd55bf0..aa5fa70c1 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -21,146 +21,259 @@ namespace Algorithms {
 namespace detail {
 
 #ifdef HAVE_CUDA
-
+/* Template for cooperative scan across the CUDA block of threads.
+ * It is a *cooperative* operation - all threads must call the operation,
+ * otherwise it will deadlock!
+ *
+ * The default implementation is generic and the scan is done using
+ * shared memory. Specializations can be made based on `Reduction` and
+ * `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported
+ * value types.
+ */
 template< ScanType scanType,
           int blockSize,
-          int valuesPerThread,
-          typename InputView,
-          typename OutputView,
-          typename Reduction >
-__global__ void
-CudaScanKernelFirstPhase( const InputView input,
-                          OutputView output,
-                          typename InputView::IndexType begin,
-                          typename InputView::IndexType end,
-                          typename OutputView::IndexType outputBegin,
-                          Reduction reduction,
-                          typename OutputView::ValueType zero,
-                          typename OutputView::ValueType* blockResults )
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockScan
 {
-   using ValueType = typename OutputView::ValueType;
-   using IndexType = typename InputView::IndexType;
+   // storage to be allocated in shared memory
+   struct Storage
+   {
+      ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ];  // accessed via Cuda::getInterleaving()
+      ValueType warpResults[ Cuda::getWarpSize() ];
+   };
 
-   // verify the configuration
-   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelFirstPhase" );
-   static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
-                  "blockSize is too large, it would not be possible to scan warpResults using one warp" );
-   static_assert( valuesPerThread % 2,
-                  "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts "
-                  "when threads access their chunks in sharedData sequentially" );
+   /* Cooperative scan across the CUDA block - each thread will get the
+    * result of the scan according to its ID.
+    *
+    * \param reduction    The binary reduction functor.
+    * \param zero         Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(zero, x) == x` for any `x`.
+    * \param threadValue  Value of the calling thread to be reduced.
+    * \param tid          Index of the calling thread (usually `threadIdx.x`,
+    *                     unless you know what you are doing).
+    * \param storage      Auxiliary storage (must be allocated as a __shared__
+    *                     variable).
+    */
+   __device__ static
+   ValueType
+   scan( const Reduction& reduction,
+         ValueType zero,
+         ValueType threadValue,
+         int tid,
+         Storage& storage )
+   {
+      // verify the configuration
+      TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaBlockScan::scan" );
+      static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
+                     "blockSize is too large, it would not be possible to scan warpResults using one warp" );
+
+      // Store the threadValue in the shared memory.
+      const int chunkResultIdx = Cuda::getInterleaving( tid );
+      storage.chunkResults[ chunkResultIdx ] = threadValue;
+      __syncthreads();
+
+      // Perform the parallel scan on chunkResults inside warps.
+      const int threadInWarpIdx = tid % Cuda::getWarpSize();
+      const int warpIdx = tid / Cuda::getWarpSize();
+      #pragma unroll
+      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
+         if( threadInWarpIdx >= stride ) {
+            storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] );
+         }
+         __syncwarp();
+      }
+      threadValue = storage.chunkResults[ chunkResultIdx ];
+
+      // The last thread in warp stores the intermediate result in warpResults.
+      if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warpIdx ] = threadValue;
+      __syncthreads();
+
+      // Perform the scan of warpResults using one warp.
+      if( warpIdx == 0 )
+         #pragma unroll
+         for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
+            if( threadInWarpIdx >= stride )
+               storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] );
+            __syncwarp();
+         }
+      __syncthreads();
 
-   // calculate indices
-   constexpr int maxElementsInBlock = blockSize * valuesPerThread;
-   const int remainingElements = end - begin - blockIdx.x * maxElementsInBlock;
-   const int elementsInBlock = TNL::min( remainingElements, maxElementsInBlock );
+      // Shift threadValue by the warpResults.
+      if( warpIdx > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] );
 
-   // update global array offsets for the thread
-   const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
-   begin += threadOffset;
-   outputBegin += threadOffset;
+      // Shift the result for exclusive scan.
+      if( scanType == ScanType::Exclusive ) {
+         storage.chunkResults[ chunkResultIdx ] = threadValue;
+         __syncthreads();
+         threadValue = (tid == 0) ? zero : storage.chunkResults[ Cuda::getInterleaving( tid - 1 ) ];
+      }
 
-   // allocate shared memory
-   __shared__ ValueType sharedData[ maxElementsInBlock ];
-   __shared__ ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ];  // accessed via Cuda::getInterleaving()
-   __shared__ ValueType warpResults[ Cuda::getWarpSize() ];
+      __syncthreads();
+      return threadValue;
+   }
+};
 
-   // Load data into the shared memory.
+/* Template for cooperative scan of a data tile in the global memory.
+ * It is a *cooperative* operation - all threads must call the operation,
+ * otherwise it will deadlock!
+ */
+template< ScanType scanType,
+          int blockSize,
+          int valuesPerThread,
+          typename Reduction,
+          typename ValueType >
+struct CudaTileScan
+{
+   using BlockScan = CudaBlockScan< ScanType::Exclusive, blockSize, Reduction, ValueType >;
+
+   // storage to be allocated in shared memory
+   struct Storage
    {
-      int idx = threadIdx.x;
-      while( idx < elementsInBlock )
-      {
-         sharedData[ idx ] = input[ begin ];
-         begin += blockDim.x;
-         idx += blockDim.x;
-      }
-      // fill the remaining (maxElementsInBlock - elementsInBlock) values with zero
-      // (this helps to avoid divergent branches in the blocks below)
-      while( idx < maxElementsInBlock )
-      {
-         sharedData[ idx ] = zero;
-         idx += blockDim.x;
-      }
-   }
-   __syncthreads();
+      ValueType data[ blockSize * valuesPerThread ];
+      typename BlockScan::Storage blockScanStorage;
+   };
 
-   // Perform sequential reduction of the chunk in shared memory.
-   const int chunkOffset = threadIdx.x * valuesPerThread;
-   const int chunkResultIdx = Cuda::getInterleaving( threadIdx.x );
+   /* Cooperative scan of a data tile in the global memory - each thread will
+    * get the result of its chunk (i.e. the last value of the (inclusive) scan
+    * in the chunk) according to the thread ID.
+    *
+    * \param input        The input array to be scanned.
+    * \param output       The array where the result will be stored.
+    * \param begin        The first element in the array to be scanned.
+    * \param end          One past the last element in the array to be scanned.
+    * \param outputBegin  The first element in the output array to be written. There
+    *                     must be at least `end - begin` elements in the output
+    *                     array starting at the position given by `outputBegin`.
+    * \param reduction    The binary reduction functor.
+    * \param zero         Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(zero, x) == x` for any `x`.
+    * \param shift        A global shift to be applied to all elements in the
+    *                     chunk processed by this thread.
+    * \param storage      Auxiliary storage (must be allocated as a __shared__
+    *                     variable).
+    */
+   template< typename InputView,
+             typename OutputView >
+   __device__ static
+   ValueType
+   scan( const InputView input,
+         OutputView output,
+         typename InputView::IndexType begin,
+         typename InputView::IndexType end,
+         typename OutputView::IndexType outputBegin,
+         const Reduction& reduction,
+         ValueType zero,
+         ValueType shift,
+         Storage& storage )
    {
-      ValueType chunkResult = sharedData[ chunkOffset ];
-      #pragma unroll
-      for( int i = 1; i < valuesPerThread; i++ )
-         chunkResult = reduction( chunkResult, sharedData[ chunkOffset + i ] );
+      // verify the configuration
+      TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaTileScan::scan" );
+      static_assert( valuesPerThread % 2,
+                     "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts "
+                     "when threads access their chunks in shared memory sequentially" );
 
-      // store the result of the sequential reduction of the chunk in chunkResults
-      chunkResults[ chunkResultIdx ] = chunkResult;
-   }
-   __syncthreads();
+      // calculate indices
+      constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+      const int remainingElements = end - begin - blockIdx.x * maxElementsInBlock;
+      const int elementsInBlock = TNL::min( remainingElements, maxElementsInBlock );
 
-   // Perform the parallel scan on chunkResults inside warps.
-   const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
-   const int warpIdx = threadIdx.x / Cuda::getWarpSize();
-   #pragma unroll
-   for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-      if( threadInWarpIdx >= stride ) {
-         chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], chunkResults[ Cuda::getInterleaving( threadIdx.x - stride ) ] );
-      }
-      __syncwarp();
-   }
+      // update global array offsets for the thread
+      const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
+      begin += threadOffset;
+      outputBegin += threadOffset;
 
-   // The last thread in warp stores the intermediate result in warpResults.
-   if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-      warpResults[ warpIdx ] = chunkResults[ chunkResultIdx ];
-   __syncthreads();
+      // Load data into the shared memory.
+      {
+         int idx = threadIdx.x;
+         while( idx < elementsInBlock )
+         {
+            storage.data[ idx ] = input[ begin ];
+            begin += blockDim.x;
+            idx += blockDim.x;
+         }
+         // fill the remaining (maxElementsInBlock - elementsInBlock) values with zero
+         // (this helps to avoid divergent branches in the blocks below)
+         while( idx < maxElementsInBlock )
+         {
+            storage.data[ idx ] = zero;
+            idx += blockDim.x;
+         }
+      }
+      __syncthreads();
 
-   // Perform the scan of warpResults using one warp.
-   if( warpIdx == 0 )
+      // Perform sequential reduction of the thread's chunk in shared memory.
+      const int chunkOffset = threadIdx.x * valuesPerThread;
+      ValueType value = storage.data[ chunkOffset ];
       #pragma unroll
-      for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
-         if( threadInWarpIdx >= stride )
-            warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] );
-         __syncwarp();
-      }
-   __syncthreads();
+      for( int i = 1; i < valuesPerThread; i++ )
+         value = reduction( value, storage.data[ chunkOffset + i ] );
 
-   // Shift chunkResults by the warpResults.
-   if( warpIdx > 0 )
-      chunkResults[ chunkResultIdx ] = reduction( chunkResults[ chunkResultIdx ], warpResults[ warpIdx - 1 ] );
-   __syncthreads();
+      // Scan the spine to obtain the initial value ("offset") for the downsweep.
+      value = BlockScan::scan( reduction, zero, value, threadIdx.x, storage.blockScanStorage );
 
-   // Downsweep step: scan the chunks and use the chunk result as the initial value.
-   {
-      ValueType value = zero;
-      if( threadIdx.x > 0 )
-         value = chunkResults[ Cuda::getInterleaving( threadIdx.x - 1 ) ];
+      // Apply the global shift.
+      value = reduction( value, shift );
 
+      // Downsweep step: scan the chunks and use the result of spine scan as the initial value.
       #pragma unroll
       for( int i = 0; i < valuesPerThread; i++ )
       {
-         const ValueType inputValue = sharedData[ chunkOffset + i ];
+         const ValueType inputValue = storage.data[ chunkOffset + i ];
          if( scanType == ScanType::Exclusive )
-            sharedData[ chunkOffset + i ] = value;
+            storage.data[ chunkOffset + i ] = value;
          value = reduction( value, inputValue );
          if( scanType == ScanType::Inclusive )
-            sharedData[ chunkOffset + i ] = value;
+            storage.data[ chunkOffset + i ] = value;
       }
+      __syncthreads();
 
-      // The last thread of the block stores the block result in the global memory.
-      if( blockResults && threadIdx.x == blockDim.x - 1 )
-         blockResults[ blockIdx.x ] = value;
-   }
-   __syncthreads();
-
-   // Store the result back in the global memory.
-   {
-      int idx = threadIdx.x;
-      while( idx < elementsInBlock )
+      // Store the result back in the global memory.
       {
-         output[ outputBegin ] = sharedData[ idx ];
-         outputBegin += blockDim.x;
-         idx += blockDim.x;
+         int idx = threadIdx.x;
+         while( idx < elementsInBlock )
+         {
+            output[ outputBegin ] = storage.data[ idx ];
+            outputBegin += blockDim.x;
+            idx += blockDim.x;
+         }
       }
+
+      // Return the last (inclusive) scan value of the chunk processed by this thread.
+      return value;
    }
+};
+
+template< ScanType scanType,
+          int blockSize,
+          int valuesPerThread,
+          typename InputView,
+          typename OutputView,
+          typename Reduction >
+__global__ void
+CudaScanKernelFirstPhase( const InputView input,
+                          OutputView output,
+                          typename InputView::IndexType begin,
+                          typename InputView::IndexType end,
+                          typename OutputView::IndexType outputBegin,
+                          Reduction reduction,
+                          typename OutputView::ValueType zero,
+                          typename OutputView::ValueType* blockResults )
+{
+   using ValueType = typename OutputView::ValueType;
+   using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;
+
+   // allocate shared memory
+   __shared__ typename TileScan::Storage storage;
+
+   // scan from input into output
+   const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, zero, zero, storage );
+
+   // The last thread of the block stores the block result in the global memory.
+   if( blockResults && threadIdx.x == blockDim.x - 1 )
+      blockResults[ blockIdx.x ] = value;
 }
 
 template< int blockSize,
-- 
GitLab


From 8accbc52f540104636951a3f2811ca93b98cdcec Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 22 Jul 2021 12:04:31 +0200
Subject: [PATCH 30/52] Optimized parallel CUDA scan algorithm to avoid
 unnecessary writing in the first phase

The original approach (prescan + uniform shift) is more efficient for
inputs that are expensive to evaluate, such as vector expressions.
---
 src/TNL/Algorithms/detail/CudaScanKernel.h    | 273 ++++++++++++++----
 src/TNL/Algorithms/detail/DistributedScan.h   |   8 +-
 src/TNL/Algorithms/detail/Scan.h              |  23 +-
 src/TNL/Algorithms/detail/Scan.hpp            |  60 ++--
 src/TNL/Algorithms/detail/ScanType.h          |   5 +
 src/TNL/Algorithms/distributedScan.h          |  10 +-
 src/TNL/Algorithms/scan.h                     |  10 +-
 .../Algorithms/distributedScanTest.h          |  20 +-
 src/UnitTests/Algorithms/scanTest.h           |  20 +-
 9 files changed, 313 insertions(+), 116 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index aa5fa70c1..4d174a1c5 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -246,6 +246,82 @@ struct CudaTileScan
    }
 };
 
+/* CudaScanKernelUpsweep - compute a partial reduction for each CUDA block.
+ */
+template< int blockSize,
+          int valuesPerThread,
+          typename InputView,
+          typename Reduction,
+          typename ValueType >
+__global__ void
+CudaScanKernelUpsweep( const InputView input,
+                       typename InputView::IndexType begin,
+                       typename InputView::IndexType end,
+                       Reduction reduction,
+                       ValueType zero,
+                       ValueType* reductionResults )
+{
+   // verify the configuration
+   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelUpsweep" );
+   static_assert( valuesPerThread % 2,
+                  "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts "
+                  "when threads access their chunks in shared memory sequentially" );
+
+   // allocate shared memory
+   using BlockReduce = CudaBlockReduce< blockSize, Reduction, ValueType >;
+   union Shared {
+      ValueType data[ blockSize * valuesPerThread ];
+      typename BlockReduce::Storage blockReduceStorage;
+   };
+   __shared__ Shared storage;
+
+   // calculate indices
+   constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+   const int remainingElements = end - begin - blockIdx.x * maxElementsInBlock;
+   const int elementsInBlock = TNL::min( remainingElements, maxElementsInBlock );
+
+   // update global array offset for the thread
+   const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
+   begin += threadOffset;
+
+   // Load data into the shared memory.
+   {
+      int idx = threadIdx.x;
+      while( idx < elementsInBlock )
+      {
+         storage.data[ idx ] = input[ begin ];
+         begin += blockDim.x;
+         idx += blockDim.x;
+      }
+      // fill the remaining (maxElementsInBlock - elementsInBlock) values with zero
+      // (this helps to avoid divergent branches in the blocks below)
+      while( idx < maxElementsInBlock )
+      {
+         storage.data[ idx ] = zero;
+         idx += blockDim.x;
+      }
+   }
+   __syncthreads();
+
+   // Perform sequential reduction of the thread's chunk in shared memory.
+   const int chunkOffset = threadIdx.x * valuesPerThread;
+   ValueType value = storage.data[ chunkOffset ];
+   #pragma unroll
+   for( int i = 1; i < valuesPerThread; i++ )
+      value = reduction( value, storage.data[ chunkOffset + i ] );
+   __syncthreads();
+
+   // Perform the parallel reduction.
+   value = BlockReduce::reduce( reduction, value, threadIdx.x, storage.blockReduceStorage );
+
+   // Store the block result in the global memory.
+   if( threadIdx.x == 0 )
+      reductionResults[ blockIdx.x ] = value;
+}
+
+/* CudaScanKernelDownsweep - scan each tile of the input separately in each CUDA
+ * block and use the result of spine scan as the initial value
+ */
 template< ScanType scanType,
           int blockSize,
           int valuesPerThread,
@@ -253,14 +329,48 @@ template< ScanType scanType,
           typename OutputView,
           typename Reduction >
 __global__ void
-CudaScanKernelFirstPhase( const InputView input,
-                          OutputView output,
-                          typename InputView::IndexType begin,
-                          typename InputView::IndexType end,
-                          typename OutputView::IndexType outputBegin,
-                          Reduction reduction,
-                          typename OutputView::ValueType zero,
-                          typename OutputView::ValueType* blockResults )
+CudaScanKernelDownsweep( const InputView input,
+                         OutputView output,
+                         typename InputView::IndexType begin,
+                         typename InputView::IndexType end,
+                         typename OutputView::IndexType outputBegin,
+                         Reduction reduction,
+                         typename OutputView::ValueType zero,
+                         typename OutputView::ValueType shift,
+                         const typename OutputView::ValueType* reductionResults )
+{
+   using ValueType = typename OutputView::ValueType;
+   using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;
+
+   // allocate shared memory
+   __shared__ typename TileScan::Storage storage;
+
+   // load the reduction of the previous tiles
+   shift = reduction( shift, reductionResults[ blockIdx.x ] );
+
+   // scan from input into output
+   TileScan::scan( input, output, begin, end, outputBegin, reduction, zero, shift, storage );
+}
+
+/* CudaScanKernelParallel - scan each tile of the input separately in each CUDA
+ * block (first phase to be followed by CudaScanKernelUniformShift when there
+ * are multiple CUDA blocks).
+ */
+template< ScanType scanType,
+          int blockSize,
+          int valuesPerThread,
+          typename InputView,
+          typename OutputView,
+          typename Reduction >
+__global__ void
+CudaScanKernelParallel( const InputView input,
+                        OutputView output,
+                        typename InputView::IndexType begin,
+                        typename InputView::IndexType end,
+                        typename OutputView::IndexType outputBegin,
+                        Reduction reduction,
+                        typename OutputView::ValueType zero,
+                        typename OutputView::ValueType* blockResults )
 {
    using ValueType = typename OutputView::ValueType;
    using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;
@@ -276,26 +386,33 @@ CudaScanKernelFirstPhase( const InputView input,
       blockResults[ blockIdx.x ] = value;
 }
 
+/* CudaScanKernelUniformShift - apply a uniform shift to a pre-scanned output
+ * array.
+ *
+ * \param blockResults  An array of per-block shifts coming from the first phase
+ *                      (computed by CudaScanKernelParallel)
+ * \param shift         A global shift to be applied to all elements of the
+ *                      output array.
+ */
 template< int blockSize,
           int valuesPerThread,
           typename OutputView,
           typename Reduction >
 __global__ void
-CudaScanKernelSecondPhase( OutputView output,
-                           typename OutputView::IndexType outputBegin,
-                           typename OutputView::IndexType outputEnd,
-                           Reduction reduction,
-                           int gridOffset,
-                           const typename OutputView::ValueType* blockResults,
-                           typename OutputView::ValueType shift )
+CudaScanKernelUniformShift( OutputView output,
+                            typename OutputView::IndexType outputBegin,
+                            typename OutputView::IndexType outputEnd,
+                            Reduction reduction,
+                            const typename OutputView::ValueType* blockResults,
+                            typename OutputView::ValueType shift )
 {
    // load the block result into a __shared__ variable first
    __shared__ typename OutputView::ValueType blockResult;
    if( threadIdx.x == 0 )
-      blockResult = blockResults[ gridOffset + blockIdx.x ];
+      blockResult = blockResults[ blockIdx.x ];
 
    // update the output offset for the thread
-   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelSecondPhase" );
+   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelUniformShift" );
    constexpr int maxElementsInBlock = blockSize * valuesPerThread;
    const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
    outputBegin += threadOffset;
@@ -318,6 +435,7 @@ CudaScanKernelSecondPhase( OutputView output,
  * \tparam valuesPerThread  Number of elements processed by each thread sequentially.
  */
 template< ScanType scanType,
+          ScanPhaseType phaseType,
           int blockSize = 256,
           // valuesPerThread should be odd to avoid shared memory bank conflicts
           int valuesPerThread = 7 >
@@ -371,6 +489,7 @@ struct CudaScanKernelLauncher
          end,
          outputBegin,
          reduction,
+         zero,
          zero );
    }
 
@@ -411,7 +530,7 @@ struct CudaScanKernelLauncher
 
          // run the kernel with just 1 block
          if( end - begin <= blockSize )
-            CudaScanKernelFirstPhase< scanType, blockSize, 1 ><<< 1, blockSize >>>
+            CudaScanKernelParallel< scanType, blockSize, 1 ><<< 1, blockSize >>>
                ( input.getConstView(),
                  output.getView(),
                  begin,
@@ -422,7 +541,7 @@ struct CudaScanKernelLauncher
                  // blockResults are shifted by 1, because the 0-th element should stay zero
                  &blockResults.getData()[ 1 ] );
          else if( end - begin <= blockSize * 3 )
-            CudaScanKernelFirstPhase< scanType, blockSize, 3 ><<< 1, blockSize >>>
+            CudaScanKernelParallel< scanType, blockSize, 3 ><<< 1, blockSize >>>
                ( input.getConstView(),
                  output.getView(),
                  begin,
@@ -433,7 +552,7 @@ struct CudaScanKernelLauncher
                  // blockResults are shifted by 1, because the 0-th element should stay zero
                  &blockResults.getData()[ 1 ] );
          else if( end - begin <= blockSize * 5 )
-            CudaScanKernelFirstPhase< scanType, blockSize, 5 ><<< 1, blockSize >>>
+            CudaScanKernelParallel< scanType, blockSize, 5 ><<< 1, blockSize >>>
                ( input.getConstView(),
                  output.getView(),
                  begin,
@@ -444,7 +563,7 @@ struct CudaScanKernelLauncher
                  // blockResults are shifted by 1, because the 0-th element should stay zero
                  &blockResults.getData()[ 1 ] );
          else
-            CudaScanKernelFirstPhase< scanType, blockSize, valuesPerThread ><<< 1, blockSize >>>
+            CudaScanKernelParallel< scanType, blockSize, valuesPerThread ><<< 1, blockSize >>>
                ( input.getConstView(),
                  output.getView(),
                  begin,
@@ -488,15 +607,30 @@ struct CudaScanKernelLauncher
             cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
 
             // run the kernel
-            CudaScanKernelFirstPhase< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
-               ( input.getConstView(),
-                 output.getView(),
-                 begin + gridOffset,
-                 begin + gridOffset + currentSize,
-                 outputBegin + gridOffset,
-                 reduction,
-                 zero,
-                 &blockResults.getData()[ gridIdx * maxGridSize() ] );
+            switch( phaseType )
+            {
+               case ScanPhaseType::WriteInFirstPhase:
+                  CudaScanKernelParallel< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( input.getConstView(),
+                       output.getView(),
+                       begin + gridOffset,
+                       begin + gridOffset + currentSize,
+                       outputBegin + gridOffset,
+                       reduction,
+                       zero,
+                       &blockResults.getData()[ gridIdx * maxGridSize() ] );
+                  break;
+
+               case ScanPhaseType::WriteInSecondPhase:
+                  CudaScanKernelUpsweep< blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( input.getConstView(),
+                       begin + gridOffset,
+                       begin + gridOffset + currentSize,
+                       reduction,
+                       zero,
+                       &blockResults.getData()[ gridIdx * maxGridSize() ] );
+                  break;
+            }
          }
 
          // synchronize the null-stream after all grids
@@ -505,7 +639,7 @@ struct CudaScanKernelLauncher
 
          // blockResults now contains scan results for each block. The first phase
          // ends by computing an exclusive scan of this array.
-         CudaScanKernelLauncher< ScanType::Exclusive >::perform(
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform(
             blockResults,
             blockResults,
             0,
@@ -552,35 +686,66 @@ struct CudaScanKernelLauncher
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero )
+                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType shift )
    {
       using Index = typename InputArray::IndexType;
 
-      // compute the number of grids
-      constexpr int maxElementsInBlock = blockSize * valuesPerThread;
-      const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
-      const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
-
-      // loop over all grids
-      for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
-         // compute current grid offset and size of data to be scanned
-         const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
-         const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
-
-         // setup block and grid size
-         dim3 cudaBlockSize, cudaGridSize;
-         cudaBlockSize.x = blockSize;
-         cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
-
-         // run the kernel
-         CudaScanKernelSecondPhase< blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+      // if the input was already scanned with just one block in the first phase,
+      // it must be shifted uniformly in the second phase
+      if( end - begin <= blockSize * valuesPerThread ) {
+         CudaScanKernelUniformShift< blockSize, valuesPerThread ><<< 1, blockSize >>>
             ( output.getView(),
-              outputBegin + gridOffset,
-              outputBegin + gridOffset + currentSize,
+              outputBegin,
+              outputBegin + end - begin,
               reduction,
-              gridIdx * maxGridSize(),
               blockShifts.getData(),
-              zero );
+              shift );
+      }
+      else {
+         // compute the number of grids
+         constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+         const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
+         const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+
+         // loop over all grids
+         for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
+            // compute current grid offset and size of data to be scanned
+            const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
+            const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
+
+            // setup block and grid size
+            dim3 cudaBlockSize, cudaGridSize;
+            cudaBlockSize.x = blockSize;
+            cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
+
+            // run the kernel
+            switch( phaseType )
+            {
+               case ScanPhaseType::WriteInFirstPhase:
+                  CudaScanKernelUniformShift< blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( output.getView(),
+                       outputBegin + gridOffset,
+                       outputBegin + gridOffset + currentSize,
+                       reduction,
+                       &blockShifts.getData()[ gridIdx * maxGridSize() ],
+                       shift );
+                  break;
+
+               case ScanPhaseType::WriteInSecondPhase:
+                  CudaScanKernelDownsweep< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( input.getConstView(),
+                       output.getView(),
+                       begin + gridOffset,
+                       begin + gridOffset + currentSize,
+                       outputBegin + gridOffset,
+                       reduction,
+                       zero,
+                       shift,
+                       &blockShifts.getData()[ gridIdx * maxGridSize() ] );
+                  break;
+            }
+         }
       }
 
       // synchronize the null-stream after all grids
diff --git a/src/TNL/Algorithms/detail/DistributedScan.h b/src/TNL/Algorithms/detail/DistributedScan.h
index db27948f2..25900c12d 100644
--- a/src/TNL/Algorithms/detail/DistributedScan.h
+++ b/src/TNL/Algorithms/detail/DistributedScan.h
@@ -21,7 +21,7 @@ namespace TNL {
 namespace Algorithms {
 namespace detail {
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
 struct DistributedScan
 {
    template< typename InputDistributedArray,
@@ -48,7 +48,7 @@ struct DistributedScan
          // perform first phase on the local data
          const auto inputLocalView = input.getConstLocalView();
          auto outputLocalView = output.getLocalView();
-         const auto block_results = Scan< DeviceType, Type >::performFirstPhase( inputLocalView, outputLocalView, begin, end, begin, reduction, zero );
+         const auto block_results = Scan< DeviceType, Type, PhaseType >::performFirstPhase( inputLocalView, outputLocalView, begin, end, begin, reduction, zero );
          const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );
 
          // exchange local results between ranks
@@ -60,11 +60,11 @@ struct DistributedScan
          MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );
 
          // compute the scan of the per-rank results
-         Scan< Devices::Host, ScanType::Exclusive >::perform( rank_results, rank_results, 0, nproc, 0, reduction, zero );
+         Scan< Devices::Host, ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform( rank_results, rank_results, 0, nproc, 0, reduction, zero );
 
          // perform the second phase, using the per-block and per-rank results
          const int rank = MPI::GetRank( group );
-         Scan< DeviceType, Type >::performSecondPhase( inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, rank_results[ rank ] );
+         Scan< DeviceType, Type, PhaseType >::performSecondPhase( inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, zero, rank_results[ rank ] );
       }
    }
 };
diff --git a/src/TNL/Algorithms/detail/Scan.h b/src/TNL/Algorithms/detail/Scan.h
index 872dcd79a..b966330c2 100644
--- a/src/TNL/Algorithms/detail/Scan.h
+++ b/src/TNL/Algorithms/detail/Scan.h
@@ -21,11 +21,11 @@ namespace TNL {
 namespace Algorithms {
 namespace detail {
 
-template< typename Device, ScanType Type >
+template< typename Device, ScanType Type, ScanPhaseType PhaseType = ScanPhaseType::WriteInSecondPhase >
 struct Scan;
 
-template< ScanType Type >
-struct Scan< Devices::Sequential, Type >
+template< ScanType Type, ScanPhaseType PhaseType >
+struct Scan< Devices::Sequential, Type, PhaseType >
 {
    template< typename InputArray,
              typename OutputArray,
@@ -63,11 +63,12 @@ struct Scan< Devices::Sequential, Type >
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero );
+                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType shift );
 };
 
-template< ScanType Type >
-struct Scan< Devices::Host, Type >
+template< ScanType Type, ScanPhaseType PhaseType >
+struct Scan< Devices::Host, Type, PhaseType >
 {
    template< typename InputArray,
              typename OutputArray,
@@ -105,11 +106,12 @@ struct Scan< Devices::Host, Type >
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero );
+                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType shift );
 };
 
-template< ScanType Type >
-struct Scan< Devices::Cuda, Type >
+template< ScanType Type, ScanPhaseType PhaseType >
+struct Scan< Devices::Cuda, Type, PhaseType >
 {
    template< typename InputArray,
              typename OutputArray,
@@ -147,7 +149,8 @@ struct Scan< Devices::Cuda, Type >
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero );
+                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType shift );
 };
 
 } // namespace detail
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
index cda45de0e..3cce2e44e 100644
--- a/src/TNL/Algorithms/detail/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -27,12 +27,12 @@ namespace TNL {
 namespace Algorithms {
 namespace detail {
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
 void
-Scan< Devices::Sequential, Type >::
+Scan< Devices::Sequential, Type, PhaseType >::
 perform( const InputArray& input,
          OutputArray& output,
          typename InputArray::IndexType begin,
@@ -59,12 +59,12 @@ perform( const InputArray& input,
    }
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
 auto
-Scan< Devices::Sequential, Type >::
+Scan< Devices::Sequential, Type, PhaseType >::
 performFirstPhase( const InputArray& input,
                    OutputArray& output,
                    typename InputArray::IndexType begin,
@@ -80,13 +80,13 @@ performFirstPhase( const InputArray& input,
    return block_results;
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename BlockShifts,
              typename Reduction >
 void
-Scan< Devices::Sequential, Type >::
+Scan< Devices::Sequential, Type, PhaseType >::
 performSecondPhase( const InputArray& input,
                     OutputArray& output,
                     const BlockShifts& blockShifts,
@@ -94,18 +94,19 @@ performSecondPhase( const InputArray& input,
                     typename InputArray::IndexType end,
                     typename OutputArray::IndexType outputBegin,
                     Reduction&& reduction,
-                    typename OutputArray::ValueType zero )
+                    typename OutputArray::ValueType zero,
+                    typename OutputArray::ValueType shift )
 {
    // artificial second phase - only one block, use the shift as the initial value
-   perform( input, output, begin, end, outputBegin, reduction, reduction( zero, blockShifts[ 0 ] ) );
+   perform( input, output, begin, end, outputBegin, reduction, reduction( zero, reduction( shift, blockShifts[ 0 ] ) ) );
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
 void
-Scan< Devices::Host, Type >::
+Scan< Devices::Host, Type, PhaseType >::
 perform( const InputArray& input,
          OutputArray& output,
          typename InputArray::IndexType begin,
@@ -158,12 +159,12 @@ perform( const InputArray& input,
       Scan< Devices::Sequential, Type >::perform( input, output, begin, end, outputBegin, reduction, zero );
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
 auto
-Scan< Devices::Host, Type >::
+Scan< Devices::Host, Type, PhaseType >::
 performFirstPhase( const InputArray& input,
                    OutputArray& output,
                    typename InputArray::IndexType begin,
@@ -212,13 +213,13 @@ performFirstPhase( const InputArray& input,
       return Scan< Devices::Sequential, Type >::performFirstPhase( input, output, begin, end, outputBegin, reduction, zero );
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename BlockShifts,
              typename Reduction >
 void
-Scan< Devices::Host, Type >::
+Scan< Devices::Host, Type, PhaseType >::
 performSecondPhase( const InputArray& input,
                     OutputArray& output,
                     const BlockShifts& blockShifts,
@@ -226,7 +227,8 @@ performSecondPhase( const InputArray& input,
                     typename InputArray::IndexType end,
                     typename OutputArray::IndexType outputBegin,
                     Reduction&& reduction,
-                    typename OutputArray::ValueType zero )
+                    typename OutputArray::ValueType zero,
+                    typename OutputArray::ValueType shift )
 {
 #ifdef HAVE_OPENMP
    using IndexType = typename InputArray::IndexType;
@@ -250,20 +252,20 @@ performSecondPhase( const InputArray& input,
          const IndexType block_output_begin = outputBegin + block_offset;
 
          // phase 2: per-block scan using the block results as initial values
-         Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, reduction( zero, blockShifts[ block_idx ] ) );
+         Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, reduction( zero, reduction( shift, blockShifts[ block_idx ] ) ) );
       }
    }
    else
 #endif
-      Scan< Devices::Sequential, Type >::performSecondPhase( input, output, blockShifts, begin, end, outputBegin, reduction, zero );
+      Scan< Devices::Sequential, Type >::performSecondPhase( input, output, blockShifts, begin, end, outputBegin, reduction, zero, shift );
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
 void
-Scan< Devices::Cuda, Type >::
+Scan< Devices::Cuda, Type, PhaseType >::
 perform( const InputArray& input,
          OutputArray& output,
          typename InputArray::IndexType begin,
@@ -276,7 +278,7 @@ perform( const InputArray& input,
    if( end <= begin )
       return;
 
-   detail::CudaScanKernelLauncher< Type >::perform(
+   detail::CudaScanKernelLauncher< Type, PhaseType >::perform(
       input,
       output,
       begin,
@@ -289,12 +291,12 @@ perform( const InputArray& input,
 #endif
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
 auto
-Scan< Devices::Cuda, Type >::
+Scan< Devices::Cuda, Type, PhaseType >::
 performFirstPhase( const InputArray& input,
                    OutputArray& output,
                    typename InputArray::IndexType begin,
@@ -310,7 +312,7 @@ performFirstPhase( const InputArray& input,
       return block_results;
    }
 
-   return detail::CudaScanKernelLauncher< Type >::performFirstPhase(
+   return detail::CudaScanKernelLauncher< Type, PhaseType >::performFirstPhase(
       input,
       output,
       begin,
@@ -323,13 +325,13 @@ performFirstPhase( const InputArray& input,
 #endif
 }
 
-template< ScanType Type >
+template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename BlockShifts,
              typename Reduction >
 void
-Scan< Devices::Cuda, Type >::
+Scan< Devices::Cuda, Type, PhaseType >::
 performSecondPhase( const InputArray& input,
                     OutputArray& output,
                     const BlockShifts& blockShifts,
@@ -337,13 +339,14 @@ performSecondPhase( const InputArray& input,
                     typename InputArray::IndexType end,
                     typename OutputArray::IndexType outputBegin,
                     Reduction&& reduction,
-                    typename OutputArray::ValueType zero )
+                    typename OutputArray::ValueType zero,
+                    typename OutputArray::ValueType shift )
 {
 #ifdef HAVE_CUDA
    if( end <= begin )
       return;
 
-   detail::CudaScanKernelLauncher< Type >::performSecondPhase(
+   detail::CudaScanKernelLauncher< Type, PhaseType >::performSecondPhase(
       input,
       output,
       blockShifts,
@@ -351,7 +354,8 @@ performSecondPhase( const InputArray& input,
       end,
       outputBegin,
       std::forward< Reduction >( reduction ),
-      zero );
+      zero,
+      shift );
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
diff --git a/src/TNL/Algorithms/detail/ScanType.h b/src/TNL/Algorithms/detail/ScanType.h
index b5721ad36..6af414436 100644
--- a/src/TNL/Algorithms/detail/ScanType.h
+++ b/src/TNL/Algorithms/detail/ScanType.h
@@ -21,6 +21,11 @@ enum class ScanType {
    Inclusive
 };
 
+enum class ScanPhaseType {  // selects in which phase of the two-phase (performFirstPhase/performSecondPhase) scan the output is written; the notes below describe the multi-block path of CudaScanKernelLauncher
+   WriteInFirstPhase,  // phase 1 launches CudaScanKernelParallel, which takes the output view and stores per-block results; phase 2 launches CudaScanKernelUniformShift on the output only
+   WriteInSecondPhase  // phase 1 launches CudaScanKernelUpsweep, which is passed only the input and the per-block results array; phase 2 launches CudaScanKernelDownsweep, which is passed the input again together with the output (default PhaseType of detail::Scan)
+};
+
 } // namespace detail
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/distributedScan.h b/src/TNL/Algorithms/distributedScan.h
index e8f001f85..573e9ac7d 100644
--- a/src/TNL/Algorithms/distributedScan.h
+++ b/src/TNL/Algorithms/distributedScan.h
@@ -64,7 +64,8 @@ distributedInclusiveScan( const InputDistributedArray& input,
                   "The input and output arrays must have the same MPI communicator." );
    TNL_ASSERT_EQ( input.getLocalRange(), output.getLocalRange(),
                   "The input and output arrays must have the same local range on all ranks." );
-   using Scan = detail::DistributedScan< detail::ScanType::Inclusive >;
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::DistributedScan< detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInFirstPhase >;
    Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
    output.startSynchronization();
 }
@@ -137,7 +138,8 @@ distributedExclusiveScan( const InputDistributedArray& input,
                   "The input and output arrays must have the same MPI communicator." );
    TNL_ASSERT_EQ( input.getLocalRange(), output.getLocalRange(),
                   "The input and output arrays must have the same local range on all ranks." );
-   using Scan = detail::DistributedScan< detail::ScanType::Exclusive >;
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::DistributedScan< detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInFirstPhase >;
    Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
    output.startSynchronization();
 }
@@ -202,7 +204,7 @@ distributedInplaceInclusiveScan( DistributedArray& array,
                                  Reduction&& reduction,
                                  typename DistributedArray::ValueType zero )
 {
-   using Scan = detail::DistributedScan< detail::ScanType::Inclusive >;
+   using Scan = detail::DistributedScan< detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInSecondPhase >;
    Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), zero );
    array.startSynchronization();
 }
@@ -265,7 +267,7 @@ distributedInplaceExclusiveScan( DistributedArray& array,
                                  Reduction&& reduction,
                                  typename DistributedArray::ValueType zero )
 {
-   using Scan = detail::DistributedScan< detail::ScanType::Exclusive >;
+   using Scan = detail::DistributedScan< detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInSecondPhase >;
    Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), zero );
    array.startSynchronization();
 }
diff --git a/src/TNL/Algorithms/scan.h b/src/TNL/Algorithms/scan.h
index 302f7f844..982106afc 100644
--- a/src/TNL/Algorithms/scan.h
+++ b/src/TNL/Algorithms/scan.h
@@ -77,7 +77,8 @@ inclusiveScan( const InputArray& input,
                   "The input and output arrays must have the same device type." );
    TNL_ASSERT_EQ( reduction( zero, zero ), zero,
                   "zero is not an idempotent value of the reduction operation" );
-   using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Inclusive >;
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInFirstPhase >;
    Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
 }
 
@@ -163,7 +164,8 @@ exclusiveScan( const InputArray& input,
                   "The input and output arrays must have the same device type." );
    TNL_ASSERT_EQ( reduction( zero, zero ), zero,
                   "zero is not an idempotent value of the reduction operation" );
-   using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Exclusive >;
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInFirstPhase >;
    Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
 }
 
@@ -238,7 +240,7 @@ inplaceInclusiveScan( Array& array,
 {
    TNL_ASSERT_EQ( reduction( zero, zero ), zero,
                   "zero is not an idempotent value of the reduction operation" );
-   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Inclusive >;
+   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInSecondPhase >;
    Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), zero );
 }
 
@@ -310,7 +312,7 @@ inplaceExclusiveScan( Array& array,
 {
    TNL_ASSERT_EQ( reduction( zero, zero ), zero,
                   "zero is not an idempotent value of the reduction operation" );
-   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Exclusive >;
+   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInSecondPhase >;
    Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), zero );
 }
 
diff --git a/src/UnitTests/Algorithms/distributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
index 0b4e73c97..22952f8b4 100644
--- a/src/UnitTests/Algorithms/distributedScanTest.h
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -87,10 +87,14 @@ protected:
 #ifdef HAVE_CUDA
       if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         CudaScanKernelLauncher< ScanType::Inclusive >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
       }
 #endif
    }
@@ -100,8 +104,12 @@ protected:
    {
 #ifdef HAVE_CUDA
       // skip the check for too small arrays
-      if( check_cuda_grids && array.getLocalRange().getSize() > 256 )
-         EXPECT_GT( ( CudaScanKernelLauncher< ScanType >::gridsCount() ), 1 );
+      if( check_cuda_grids && array.getLocalRange().getSize() > 256 ) {
+         // we don't care which kernel launcher was actually used
+         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
+                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
+         EXPECT_GT( gridsCount, 1 );
+      }
 #endif
 
       array_host = array;
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
index d57c923fa..52ea507cc 100644
--- a/src/UnitTests/Algorithms/scanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -59,10 +59,14 @@ protected:
 #ifdef HAVE_CUDA
       if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         CudaScanKernelLauncher< ScanType::Inclusive >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
       }
 #endif
    }
@@ -72,8 +76,12 @@ protected:
    {
 #ifdef HAVE_CUDA
       // skip the check for too small arrays
-      if( array.getSize() > 256 )
-         EXPECT_GT( ( CudaScanKernelLauncher< ScanType >::gridsCount() ), 1 );
+      if( array.getSize() > 256 ) {
+         // we don't care which kernel launcher was actually used
+         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
+                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
+         EXPECT_GT( gridsCount, 1 );
+      }
 #endif
 
       array_host = array;
-- 
GitLab


From 57d66051bbeb5ac48c1419ab4da4b05a5edb9c98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 22 Jul 2021 15:46:21 +0200
Subject: [PATCH 31/52] CudaScanKernelLauncher: configuration of blockSize
 depending on the ValueType

---
 src/TNL/Algorithms/detail/CudaScanKernel.h    |  9 +++++++--
 src/TNL/Algorithms/detail/Scan.hpp            |  6 +++---
 .../Algorithms/distributedScanTest.h          | 20 +++++++++----------
 src/UnitTests/Algorithms/scanTest.h           | 20 +++++++++----------
 4 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 4d174a1c5..41f304d46 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -436,7 +436,10 @@ CudaScanKernelUniformShift( OutputView output,
  */
 template< ScanType scanType,
           ScanPhaseType phaseType,
-          int blockSize = 256,
+          typename ValueType,
+          // use blockSize=256 for 32-bit value types, scale with sizeof(ValueType)
+          // to keep shared memory requirements constant
+          int blockSize = 256 * 4 / sizeof(ValueType),
           // valuesPerThread should be odd to avoid shared memory bank conflicts
           int valuesPerThread = 7 >
 struct CudaScanKernelLauncher
@@ -520,6 +523,7 @@ struct CudaScanKernelLauncher
                       Reduction&& reduction,
                       typename OutputArray::ValueType zero )
    {
+      static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
       using Index = typename InputArray::IndexType;
 
       if( end - begin <= blockSize * valuesPerThread ) {
@@ -639,7 +643,7 @@ struct CudaScanKernelLauncher
 
          // blockResults now contains scan results for each block. The first phase
          // ends by computing an exclusive scan of this array.
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform(
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::perform(
             blockResults,
             blockResults,
             0,
@@ -689,6 +693,7 @@ struct CudaScanKernelLauncher
                        typename OutputArray::ValueType zero,
                        typename OutputArray::ValueType shift )
    {
+      static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
       using Index = typename InputArray::IndexType;
 
       // if the input was already scanned with just one block in the first phase,
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
index 3cce2e44e..e8384cf04 100644
--- a/src/TNL/Algorithms/detail/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -278,7 +278,7 @@ perform( const InputArray& input,
    if( end <= begin )
       return;
 
-   detail::CudaScanKernelLauncher< Type, PhaseType >::perform(
+   detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::perform(
       input,
       output,
       begin,
@@ -312,7 +312,7 @@ performFirstPhase( const InputArray& input,
       return block_results;
    }
 
-   return detail::CudaScanKernelLauncher< Type, PhaseType >::performFirstPhase(
+   return detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performFirstPhase(
       input,
       output,
       begin,
@@ -346,7 +346,7 @@ performSecondPhase( const InputArray& input,
    if( end <= begin )
       return;
 
-   detail::CudaScanKernelLauncher< Type, PhaseType >::performSecondPhase(
+   detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performSecondPhase(
       input,
       output,
       blockShifts,
diff --git a/src/UnitTests/Algorithms/distributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
index 22952f8b4..102f49dc6 100644
--- a/src/UnitTests/Algorithms/distributedScanTest.h
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -87,14 +87,14 @@ protected:
 #ifdef HAVE_CUDA
       if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
       }
 #endif
    }
@@ -106,8 +106,8 @@ protected:
       // skip the check for too small arrays
       if( check_cuda_grids && array.getLocalRange().getSize() > 256 ) {
          // we don't care which kernel launcher was actually used
-         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
-                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
+         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
+                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
          EXPECT_GT( gridsCount, 1 );
       }
 #endif
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
index 52ea507cc..d59a7db99 100644
--- a/src/UnitTests/Algorithms/scanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -59,14 +59,14 @@ protected:
 #ifdef HAVE_CUDA
       if( std::is_same< DeviceType, Devices::Cuda >::value )
       {
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
-         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
       }
 #endif
    }
@@ -78,8 +78,8 @@ protected:
       // skip the check for too small arrays
       if( array.getSize() > 256 ) {
          // we don't care which kernel launcher was actually used
-         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
-                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
+         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
+                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
          EXPECT_GT( gridsCount, 1 );
       }
 #endif
-- 
GitLab


From 8f8c301b5b38483748730c2629e9da9ec6d2542e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 22 Jul 2021 16:22:39 +0200
Subject: [PATCH 32/52] Added more scan operations to the BLAS benchmark

---
 src/Benchmarks/BLAS/vector-operations.h | 128 +++++++++++++++++++++---
 1 file changed, 116 insertions(+), 12 deletions(-)

diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index fc5f1b29e..3391f23fa 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -13,6 +13,7 @@
 #pragma once
 
 #include <stdlib.h> // srand48
+#include <numeric>  // std::partial_sum
 
 #include "../Benchmarks.h"
 
@@ -40,6 +41,7 @@ benchmarkVectorOperations( Benchmark & benchmark,
 {
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
    using CudaVector =  Containers::Vector< Real, Devices::Cuda, Index >;
+   using SequentialView = Containers::VectorView< Real, Devices::Sequential, Index >;
    using HostView = Containers::VectorView< Real, Devices::Host, Index >;
    using CudaView =  Containers::VectorView< Real, Devices::Cuda, Index >;
 
@@ -565,31 +567,133 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
 
    ////
-   // Inclusive scan
-   auto inclusiveScanHost = [&]() {
+   // Inplace inclusive scan
+   auto inplaceInclusiveScanHost = [&]() {
       Algorithms::inplaceInclusiveScan( hostVector );
    };
-   benchmark.setOperation( "inclusive scan", 2 * datasetSize );
-   benchmark.time< Devices::Host >( reset1, "CPU ET", inclusiveScanHost );
+   auto inplaceInclusiveScanSequential = [&]() {
+      SequentialView view;
+      view.bind( hostVector.getData(), hostVector.getSize() );
+      Algorithms::inplaceInclusiveScan( view );
+   };
+   auto inplaceInclusiveScanSTL = [&]() {
+      std::partial_sum( hostVector.getData(), hostVector.getData() + hostVector.getSize(), hostVector.getData() );
+   };
+   benchmark.setOperation( "inclusive scan (inplace)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", inplaceInclusiveScanHost );
+   benchmark.time< Devices::Sequential >( reset1, "CPU sequential", inplaceInclusiveScanSequential );
+   benchmark.time< Devices::Sequential >( reset1, "CPU std::partial_sum", inplaceInclusiveScanSTL );
+   // TODO: since C++17 there are also `std::inclusive_scan` and `std::exclusive_scan`, which have overloads
+   // taking a parallel execution policy - add them to the benchmark when we use C++17
 #ifdef HAVE_CUDA
-   auto inclusiveScanCuda = [&]() {
+   auto inplaceInclusiveScanCuda = [&]() {
       Algorithms::inplaceInclusiveScan( deviceVector );
    };
-   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusiveScanCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inplaceInclusiveScanCuda );
+#endif
+
+   ////
+   // Inclusive scan of one vector
+   auto inclusiveScanOneVectorHost = [&]() {
+      Algorithms::inclusiveScan( hostVector, hostVector2 );
+   };
+   benchmark.setOperation( "inclusive scan (1 vector)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", inclusiveScanOneVectorHost );
+#ifdef HAVE_CUDA
+   auto inclusiveScanOneVectorCuda = [&]() {
+      Algorithms::inclusiveScan( deviceVector, deviceVector2 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", inclusiveScanOneVectorCuda );
+#endif
+
+   ////
+   // Inclusive scan of two vectors
+   auto inclusiveScanTwoVectorsHost = [&]() {
+      Algorithms::inclusiveScan( hostVector + hostVector2, hostVector3 );
+   };
+   benchmark.setOperation( "inclusive scan (2 vectors)", 3 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", inclusiveScanTwoVectorsHost );
+#ifdef HAVE_CUDA
+   auto inclusiveScanTwoVectorsCuda = [&]() {
+      Algorithms::inclusiveScan( deviceVector + deviceVector2, deviceVector3 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", inclusiveScanTwoVectorsCuda );
+#endif
+
+   ////
+   // Inclusive scan of three vectors
+   auto inclusiveScanThreeVectorsHost = [&]() {
+      Algorithms::inclusiveScan( hostVector + hostVector2 + hostVector3, hostVector4 );
+   };
+   benchmark.setOperation( "inclusive scan (3 vectors)", 4 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", inclusiveScanThreeVectorsHost );
+#ifdef HAVE_CUDA
+   auto inclusiveScanThreeVectorsCuda = [&]() {
+      Algorithms::inclusiveScan( deviceVector + deviceVector2 + deviceVector3, deviceVector4 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", inclusiveScanThreeVectorsCuda );
 #endif
 
    ////
-   // Exclusive scan
-   auto exclusiveScanHost = [&]() {
+   // Inplace exclusive scan
+   auto inplaceExclusiveScanHost = [&]() {
       Algorithms::inplaceExclusiveScan( hostVector );
    };
-   benchmark.setOperation( "exclusive scan", 2 * datasetSize );
-   benchmark.time< Devices::Host >( reset1, "CPU ET", exclusiveScanHost );
+   auto inplaceExclusiveScanSequential = [&]() {
+      SequentialView view;
+      view.bind( hostVector.getData(), hostVector.getSize() );
+      Algorithms::inplaceExclusiveScan( view );
+   };
+   benchmark.setOperation( "exclusive scan (inplace)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", inplaceExclusiveScanHost );
+   benchmark.time< Devices::Sequential >( reset1, "CPU sequential", inplaceExclusiveScanSequential );
 #ifdef HAVE_CUDA
-   auto exclusiveScanCuda = [&]() {
+   auto inplaceExclusiveScanCuda = [&]() {
       Algorithms::inplaceExclusiveScan( deviceVector );
    };
-   benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusiveScanCuda );
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inplaceExclusiveScanCuda );
+#endif
+
+   ////
+   // Exclusive scan of one vector
+   auto exclusiveScanOneVectorHost = [&]() {
+      Algorithms::exclusiveScan( hostVector, hostVector2 );
+   };
+   benchmark.setOperation( "exclusive scan (1 vector)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", exclusiveScanOneVectorHost );
+#ifdef HAVE_CUDA
+   auto exclusiveScanOneVectorCuda = [&]() {
+      Algorithms::exclusiveScan( deviceVector, deviceVector2 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", exclusiveScanOneVectorCuda );
+#endif
+
+   ////
+   // Exclusive scan of two vectors
+   auto exclusiveScanTwoVectorsHost = [&]() {
+      Algorithms::exclusiveScan( hostVector + hostVector2, hostVector3 );
+   };
+   benchmark.setOperation( "exclusive scan (2 vectors)", 3 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", exclusiveScanTwoVectorsHost );
+#ifdef HAVE_CUDA
+   auto exclusiveScanTwoVectorsCuda = [&]() {
+      Algorithms::exclusiveScan( deviceVector + deviceVector2, deviceVector3 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", exclusiveScanTwoVectorsCuda );
+#endif
+
+   ////
+   // Exclusive scan of three vectors
+   auto exclusiveScanThreeVectorsHost = [&]() {
+      Algorithms::exclusiveScan( hostVector + hostVector2 + hostVector3, hostVector4 );
+   };
+   benchmark.setOperation( "exclusive scan (3 vectors)", 4 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", exclusiveScanThreeVectorsHost );
+#ifdef HAVE_CUDA
+   auto exclusiveScanThreeVectorsCuda = [&]() {
+      Algorithms::exclusiveScan( deviceVector + deviceVector2 + deviceVector3, deviceVector4 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", exclusiveScanThreeVectorsCuda );
 #endif
 
 #ifdef HAVE_CUDA
-- 
GitLab


From 4743a5658bb60d4414885c19aa850a661e25b6ca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 22 Jul 2021 20:11:11 +0200
Subject: [PATCH 33/52] Optimized parallel OpenMP scan algorithm for expensive
 inputs

This adds back the original approach (prescan + uniform shift) which
was removed too early.
---
 src/TNL/Algorithms/detail/Scan.h   |   3 +-
 src/TNL/Algorithms/detail/Scan.hpp | 155 ++++++++++++++++++++++++-----
 2 files changed, 132 insertions(+), 26 deletions(-)

diff --git a/src/TNL/Algorithms/detail/Scan.h b/src/TNL/Algorithms/detail/Scan.h
index b966330c2..c97a3f8fe 100644
--- a/src/TNL/Algorithms/detail/Scan.h
+++ b/src/TNL/Algorithms/detail/Scan.h
@@ -30,7 +30,8 @@ struct Scan< Devices::Sequential, Type, PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
-   static void
+   // returns the last value of inclusive scan (reduction of the whole input)
+   static typename OutputArray::ValueType
    perform( const InputArray& input,
             OutputArray& output,
             typename InputArray::IndexType begin,
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
index e8384cf04..4329294da 100644
--- a/src/TNL/Algorithms/detail/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -31,7 +31,7 @@ template< ScanType Type, ScanPhaseType PhaseType >
    template< typename InputArray,
              typename OutputArray,
              typename Reduction >
-void
+typename OutputArray::ValueType
 Scan< Devices::Sequential, Type, PhaseType >::
 perform( const InputArray& input,
          OutputArray& output,
@@ -57,6 +57,8 @@ perform( const InputArray& input,
          aux = reduction( aux, x );
       }
    }
+   // return the last value of inclusive scan (reduction of the whole input)
+   return aux;
 }
 
 template< ScanType Type, ScanPhaseType PhaseType >
@@ -73,11 +75,32 @@ performFirstPhase( const InputArray& input,
                    Reduction&& reduction,
                    typename OutputArray::ValueType zero )
 {
-   // artificial first phase - only reduce the block
-   Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 2 );
-   block_results[ 0 ] = zero;
-   block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, input, reduction, zero );
-   return block_results;
+   if( end <= begin ) {
+      Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 1 );
+      block_results.setValue( zero );
+      return block_results;
+   }
+
+   switch( PhaseType )
+   {
+      case ScanPhaseType::WriteInFirstPhase:
+      {
+         // artificial first phase - pre-scan the block
+         Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 2 );
+         block_results[ 0 ] = zero;
+         block_results[ 1 ] = perform( input, output, begin, end, outputBegin, reduction, zero );
+         return block_results;
+      }
+
+      case ScanPhaseType::WriteInSecondPhase:
+      {
+         // artificial first phase - only reduce the block
+         Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 2 );
+         block_results[ 0 ] = zero;
+         block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, input, reduction, zero );
+         return block_results;
+      }
+   };
 }
 
 template< ScanType Type, ScanPhaseType PhaseType >
@@ -97,8 +120,25 @@ performSecondPhase( const InputArray& input,
                     typename OutputArray::ValueType zero,
                     typename OutputArray::ValueType shift )
 {
-   // artificial second phase - only one block, use the shift as the initial value
-   perform( input, output, begin, end, outputBegin, reduction, reduction( zero, reduction( shift, blockShifts[ 0 ] ) ) );
+   switch( PhaseType )
+   {
+      case ScanPhaseType::WriteInFirstPhase:
+      {
+         // artificial second phase - uniform shift of a pre-scanned block
+         shift = reduction( shift, blockShifts[ 0 ] );
+         typename InputArray::IndexType outputEnd = outputBegin + end - begin;
+         for( typename InputArray::IndexType i = outputBegin; i < outputEnd; i++ )
+            output[ i ] = reduction( output[ i ], shift );
+         break;
+      }
+
+      case ScanPhaseType::WriteInSecondPhase:
+      {
+         // artificial second phase - only one block, use the shift as the initial value
+         perform( input, output, begin, end, outputBegin, reduction, reduction( shift, blockShifts[ 0 ] ) );
+         break;
+      }
+   }
 }
 
 template< ScanType Type, ScanPhaseType PhaseType >
@@ -139,19 +179,49 @@ perform( const InputArray& input,
          const IndexType block_end = TNL::min( block_begin + block_size, end );
          const IndexType block_output_begin = outputBegin + block_offset;
 
-         // step 1: per-block reductions, write the result into the buffer
-         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
-
-         #pragma omp barrier
-
-         // step 2: scan the block results
-         #pragma omp single
+         switch( PhaseType )
          {
-            Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
+            case ScanPhaseType::WriteInFirstPhase:
+            {
+               // step 1: pre-scan the block and save the result of the block reduction
+               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, zero );
+
+               #pragma omp barrier
+
+               // step 2: scan the block results
+               #pragma omp single
+               {
+                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
+               }
+
+               // step 3: uniform shift of the pre-scanned block
+               const ValueType block_shift = block_results[ block_idx ];
+               const IndexType block_output_end = block_output_begin + block_end - block_begin;
+               for( IndexType i = block_output_begin; i < block_output_end; i++ )
+                  output[ i ] = reduction( output[ i ], block_shift );
+
+               break;
+            }
+
+            case ScanPhaseType::WriteInSecondPhase:
+            {
+               // step 1: per-block reductions, write the result into the buffer
+               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
+
+               #pragma omp barrier
+
+               // step 2: scan the block results
+               #pragma omp single
+               {
+                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
+               }
+
+               // step 3: per-block scan using the block results as initial values
+               Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, block_results[ block_idx ] );
+
+               break;
+            }
          }
-
-         // step 3: per-block scan using the block results as initial values
-         Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, block_results[ block_idx ] );
       }
    }
    else
@@ -195,14 +265,30 @@ performFirstPhase( const InputArray& input,
       #pragma omp parallel num_threads(threads)
       {
          const int block_idx = omp_get_thread_num();
-         const IndexType block_begin = begin + block_idx * block_size;
+         const IndexType block_offset = block_idx * block_size;
+         const IndexType block_begin = begin + block_offset;
          const IndexType block_end = TNL::min( block_begin + block_size, end );
+         const IndexType block_output_begin = outputBegin + block_offset;
 
-         // step 1: per-block reductions, write the result into the buffer
-         block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
+         switch( PhaseType )
+         {
+            case ScanPhaseType::WriteInFirstPhase:
+            {
+               // pre-scan the block, write the result of the block reduction into the buffer
+               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, zero );
+               break;
+            }
+
+            case ScanPhaseType::WriteInSecondPhase:
+            {
+               // upsweep: per-block reductions, write the result into the buffer
+               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
+               break;
+            }
+         }
       }
 
-      // step 2: scan the block results
+      // spine step: scan the block results
       Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
 
       // block_results now contains shift values for each block - to be used in the second phase
@@ -231,6 +317,7 @@ performSecondPhase( const InputArray& input,
                     typename OutputArray::ValueType shift )
 {
 #ifdef HAVE_OPENMP
+   using ValueType = typename OutputArray::ValueType;
    using IndexType = typename InputArray::IndexType;
 
    if( end <= begin )
@@ -251,8 +338,26 @@ performSecondPhase( const InputArray& input,
          const IndexType block_end = TNL::min( block_begin + block_size, end );
          const IndexType block_output_begin = outputBegin + block_offset;
 
-         // phase 2: per-block scan using the block results as initial values
-         Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, reduction( zero, reduction( shift, blockShifts[ block_idx ] ) ) );
+         const ValueType block_shift = reduction( shift, blockShifts[ block_idx ] );
+
+         switch( PhaseType )
+         {
+            case ScanPhaseType::WriteInFirstPhase:
+            {
+               // uniform shift of a pre-scanned block
+               const IndexType block_output_end = block_output_begin + block_end - block_begin;
+               for( IndexType i = block_output_begin; i < block_output_end; i++ )
+                  output[ i ] = reduction( output[ i ], block_shift );
+               break;
+            }
+
+            case ScanPhaseType::WriteInSecondPhase:
+            {
+               // downsweep: per-block scan using the block results as initial values
+               Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, block_shift );
+               break;
+            }
+         }
       }
    }
    else
-- 
GitLab


From d97cea88cb6e6c7975f90091126dc62add22a89c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 23 Jul 2021 13:24:59 +0200
Subject: [PATCH 34/52] Added default reduction functional (TNL::Plus) to
 reduce

---
 src/TNL/Algorithms/reduce.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Algorithms/reduce.h b/src/TNL/Algorithms/reduce.h
index 877fb2c0b..2100ff827 100644
--- a/src/TNL/Algorithms/reduce.h
+++ b/src/TNL/Algorithms/reduce.h
@@ -118,18 +118,18 @@ Result reduce( const Index begin,
 template< typename Device,
           typename Index,
           typename Fetch,
-          typename Reduction >
+          typename Reduction = TNL::Plus >
 auto reduce( const Index begin,
              const Index end,
              Fetch&& fetch,
-             Reduction&& reduction )
+             Reduction&& reduction = TNL::Plus{} )
 {
    using Result = std::decay_t< decltype( fetch( 0 ) ) >;
-   return detail::Reduction< Device >::reduce( begin,
-                                               end,
-                                               std::forward< Fetch >( fetch ),
-                                               std::forward< Reduction >( reduction ),
-                                               reduction.template getIdempotent< Result >() );
+   return reduce< Device >( begin,
+                            end,
+                            std::forward< Fetch >( fetch ),
+                            std::forward< Reduction >( reduction ),
+                            reduction.template getIdempotent< Result >() );
 }
 
 /**
@@ -153,7 +153,7 @@ auto reduce( const Index begin,
  * \param reduction is a lambda function defining the reduction operation and managing the elements positions.
  * \param zero is the idempotent element for the reduction operation, i.e. element which
  *             does not change the result of the reduction.
- * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
+ * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first`
  *         is the element position and `pair.second` is the reduction result.
  *
  * The `fetch` lambda function takes one argument which is index of the element to be fetched:
@@ -218,7 +218,7 @@ reduceWithArgument( const Index begin,
  * \param reduction is a lambda function defining the reduction operation and managing the elements positions.
  * \param zero is the idempotent element for the reduction operation, i.e. element which
  *             does not change the result of the reduction.
- * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
+ * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first`
  *         is the element position and `pair.second` is the reduction result.
  *
  * The `fetch` lambda function takes one argument which is index of the element to be fetched:
@@ -252,11 +252,11 @@ reduceWithArgument( const Index begin,
                     Reduction&& reduction )
 {
    using Result = std::decay_t< decltype( fetch( 0 ) ) >;
-   return detail::Reduction< Device >::reduceWithArgument( begin,
-                                                           end,
-                                                           std::forward< Fetch >( fetch ),
-                                                           std::forward< Reduction >( reduction ),
-                                                           reduction.template getIdempotent< Result >() );
+   return reduceWithArgument< Device >( begin,
+                                        end,
+                                        std::forward< Fetch >( fetch ),
+                                        std::forward< Reduction >( reduction ),
+                                        reduction.template getIdempotent< Result >() );
 }
 
 } // namespace Algorithms
-- 
GitLab


From a4bddfff94233d228825367899b1187994124bc7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 23 Jul 2021 14:43:30 +0200
Subject: [PATCH 35/52] Added definition of ValueType to expression templates

---
 .../reduceArrayExample.cpp}                   |  0
 .../reduceArrayExample.cu}                    |  0
 .../ArrayViewExample_reduceElements.cpp       | 45 -------------------
 .../ArrayViewExample_reduceElements.cu        |  1 -
 .../DistributedExpressionTemplates.h          |  4 ++
 .../Expressions/ExpressionTemplates.h         |  4 ++
 .../Expressions/StaticExpressionTemplates.h   |  4 ++
 7 files changed, 12 insertions(+), 46 deletions(-)
 rename Documentation/Examples/{Containers/ArrayExample_reduceElements.cpp => Algorithms/reduceArrayExample.cpp} (100%)
 rename Documentation/Examples/{Containers/ArrayExample_reduceElements.cu => Algorithms/reduceArrayExample.cu} (100%)
 delete mode 100644 Documentation/Examples/Containers/ArrayViewExample_reduceElements.cpp
 delete mode 120000 Documentation/Examples/Containers/ArrayViewExample_reduceElements.cu

diff --git a/Documentation/Examples/Containers/ArrayExample_reduceElements.cpp b/Documentation/Examples/Algorithms/reduceArrayExample.cpp
similarity index 100%
rename from Documentation/Examples/Containers/ArrayExample_reduceElements.cpp
rename to Documentation/Examples/Algorithms/reduceArrayExample.cpp
diff --git a/Documentation/Examples/Containers/ArrayExample_reduceElements.cu b/Documentation/Examples/Algorithms/reduceArrayExample.cu
similarity index 100%
rename from Documentation/Examples/Containers/ArrayExample_reduceElements.cu
rename to Documentation/Examples/Algorithms/reduceArrayExample.cu
diff --git a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cpp b/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cpp
deleted file mode 100644
index 1357ac8d0..000000000
--- a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-#include <iostream>
-#include <functional>
-#include <TNL/Containers/Array.h>
-#include <TNL/Containers/ArrayView.h>
-
-using namespace TNL;
-
-template< typename Device >
-void reduceElementsExample()
-{
-   /****
-    * Create new arrays
-    */
-   const int size = 10;
-   Containers::Array< float, Device > a( size );
-   auto a_view = a.getView();
-
-   /****
-    * Initiate the elements of array `a`
-    */
-   a_view.forAllElements( [] __cuda_callable__ ( int i, float& value ) { value = i; } );
-
-   /****
-    * Sum all elements of array `a`
-    */
-   auto fetch = [=] __cuda_callable__ ( int i, float& value ) { return value; };
-   auto sum = a_view.reduceEachElement( fetch, std::plus<>{}, 0.0 );
-
-   /****
-    * Print the results
-    */
-   std::cout << " a = " << a << std::endl;
-   std::cout << " sum = " << sum << std::endl;
-}
-
-int main( int argc, char* argv[] )
-{
-   std::cout << "Running example on the host system: " << std::endl;
-   reduceElementsExample< Devices::Host >();
-
-#ifdef HAVE_CUDA
-   std::cout << "Running example on the CUDA device: " << std::endl;
-   reduceElementsExample< Devices::Cuda >();
-#endif
-}
diff --git a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cu b/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cu
deleted file mode 120000
index 220efb6f8..000000000
--- a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cu
+++ /dev/null
@@ -1 +0,0 @@
-ArrayViewExample_reduceElements.cpp
\ No newline at end of file
diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index e257399f6..093547d0b 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -57,6 +57,7 @@ template< typename T1,
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using LocalRangeType = typename T1::LocalRangeType;
@@ -155,6 +156,7 @@ template< typename T1,
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable >
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using LocalRangeType = typename T1::LocalRangeType;
@@ -236,6 +238,7 @@ template< typename T1,
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable >
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
    using LocalRangeType = typename T2::LocalRangeType;
@@ -318,6 +321,7 @@ template< typename T1,
 struct DistributedUnaryExpressionTemplate
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using LocalRangeType = typename T1::LocalRangeType;
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 29ea9f013..6e9ae794e 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -59,6 +59,7 @@ template< typename T1,
 struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using ConstViewType = BinaryExpressionTemplate;
@@ -116,6 +117,7 @@ template< typename T1,
 struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable >
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using ConstViewType = BinaryExpressionTemplate;
@@ -165,6 +167,7 @@ template< typename T1,
 struct BinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable >
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
    using ConstViewType = BinaryExpressionTemplate;
@@ -215,6 +218,7 @@ template< typename T1,
 struct UnaryExpressionTemplate
 {
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using ConstViewType = UnaryExpressionTemplate;
diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
index da2c8cdd2..3709b5630 100644
--- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
@@ -60,6 +60,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariab
 {
    using VectorOperandType = T1;
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
                   "Left-hand side operand of static expression is not static, i.e. based on static vector." );
@@ -114,6 +115,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariab
 {
    using VectorOperandType = T1;
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
                   "Left-hand side operand of static expression is not static, i.e. based on static vector." );
@@ -162,6 +164,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, Ve
 {
    using VectorOperandType = T2;
    using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T2 >::value,
                   "Right-hand side operand of static expression is not static, i.e. based on static vector." );
@@ -211,6 +214,7 @@ struct StaticUnaryExpressionTemplate
 {
    using VectorOperandType = T1;
    using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
                   "The operand of static expression is not static, i.e. based on static vector." );
-- 
GitLab


From 9ce0b16973d9c385a45d9ac949dab0962b5589c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 23 Jul 2021 15:24:47 +0200
Subject: [PATCH 36/52] Removed reduction methods from Array and ArrayView,
 instead added overloads of reduce and reduceWithArgument for arrays/views

Plain functions are much more flexible than methods. The methods were
also violating the open-closed principle:
https://en.wikipedia.org/wiki/Open%E2%80%93closed_principle
---
 .../Examples/Algorithms/CMakeLists.txt        |  28 ++-
 .../Algorithms/reduceArrayExample.cpp         |  21 +-
 .../Examples/Algorithms/reduceArrayExample.cu |   2 +-
 .../reduceWithArgumentArrayExample.cpp        |  41 ++++
 .../reduceWithArgumentArrayExample.cu         |   1 +
 .../Examples/Containers/CMakeLists.txt        |   2 -
 .../Containers/DistributedArrayExample.cu     |   1 +
 src/TNL/Algorithms/reduce.h                   | 184 ++++++++++++++++--
 src/TNL/Containers/Array.h                    | 148 --------------
 src/TNL/Containers/Array.hpp                  |  56 ------
 src/TNL/Containers/ArrayView.h                | 148 --------------
 src/TNL/Containers/ArrayView.hpp              |  62 ------
 src/UnitTests/Containers/VectorTest.h         |  25 ---
 13 files changed, 244 insertions(+), 475 deletions(-)
 create mode 100644 Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cpp
 create mode 120000 Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cu
 create mode 120000 Documentation/Examples/Containers/DistributedArrayExample.cu

diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 5ffb91b16..982b9c06f 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -1,21 +1,39 @@
 IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
    CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu)
    ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
+
    CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
+
    CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
+
+   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+
+   CUDA_ADD_EXECUTABLE(reduceArrayExampleCuda reduceArrayExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
+
+   CUDA_ADD_EXECUTABLE(reduceWithArgumentArrayExampleCuda reduceWithArgumentArrayExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
 ELSE()
-   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
    ADD_EXECUTABLE( SortingExample SortingExample.cpp)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
+
    ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
+
    ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
+
+   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+
+   ADD_EXECUTABLE(reduceArrayExample reduceArrayExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
+
+   ADD_EXECUTABLE(reduceWithArgumentArrayExample reduceWithArgumentArrayExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
 ENDIF()
 
 ADD_EXECUTABLE(staticForExample staticForExample.cpp)
@@ -29,6 +47,8 @@ ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
    SortingExample2.out
    SortingExample3.out
    ParallelForExample.out
+   reduceArrayExample.out
+   reduceWithArgumentArrayExample.out
    unrolledForExample.out
    staticForExample.out
 )
diff --git a/Documentation/Examples/Algorithms/reduceArrayExample.cpp b/Documentation/Examples/Algorithms/reduceArrayExample.cpp
index bdf943732..5af0a2436 100644
--- a/Documentation/Examples/Algorithms/reduceArrayExample.cpp
+++ b/Documentation/Examples/Algorithms/reduceArrayExample.cpp
@@ -1,12 +1,10 @@
-#include <iostream>
-#include <functional>
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/ArrayView.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 
 template< typename Device >
-void reduceElementsExample()
+void reduceArrayExample()
 {
    /****
     * Create new arrays
@@ -22,23 +20,28 @@ void reduceElementsExample()
    /****
     * Sum all elements of array `a`
     */
-   auto fetch = [=] __cuda_callable__ ( int i, float& value ) { return value; };
-   auto sum = a.reduceEachElement( fetch, std::plus<>{}, 0.0 );
+   float sum_total = Algorithms::reduce( a, TNL::Plus{} );
+
+   /****
+    * Sum last 5 elements of array `a`
+    */
+   float sum_last_five = Algorithms::reduce( a.getConstView( 5, 10 ), TNL::Plus{} );
 
    /****
     * Print the results
     */
    std::cout << " a = " << a << std::endl;
-   std::cout << " sum = " << sum << std::endl;
+   std::cout << " sum of all elements = " << sum_total << std::endl;
+   std::cout << " sum of last 5 elements = " << sum_last_five << std::endl;
 }
 
 int main( int argc, char* argv[] )
 {
    std::cout << "Running example on the host system: " << std::endl;
-   reduceElementsExample< Devices::Host >();
+   reduceArrayExample< Devices::Host >();
 
 #ifdef HAVE_CUDA
    std::cout << "Running example on the CUDA device: " << std::endl;
-   reduceElementsExample< Devices::Cuda >();
+   reduceArrayExample< Devices::Cuda >();
 #endif
 }
diff --git a/Documentation/Examples/Algorithms/reduceArrayExample.cu b/Documentation/Examples/Algorithms/reduceArrayExample.cu
index 466460f2f..87a4a2310 120000
--- a/Documentation/Examples/Algorithms/reduceArrayExample.cu
+++ b/Documentation/Examples/Algorithms/reduceArrayExample.cu
@@ -1 +1 @@
-ArrayExample_reduceElements.cpp
\ No newline at end of file
+reduceArrayExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cpp b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cpp
new file mode 100644
index 000000000..c29764ad3
--- /dev/null
+++ b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cpp
@@ -0,0 +1,41 @@
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/reduce.h>
+
+using namespace TNL;
+
+template< typename Device >
+void reduceArrayExample()
+{
+   /****
+    * Create new arrays
+    */
+   const int size = 10;
+   Containers::Vector< float, Device > a( size );
+
+   /****
+    * Initialize the elements of array `a`
+    */
+   a.forAllElements( [] __cuda_callable__ ( int i, float& value ) { value = 3 - i; } );
+
+   /****
+    * Reduce all elements of array `a`
+    */
+   std::pair< float, int > result_total = Algorithms::reduceWithArgument( TNL::abs( a ), TNL::MaxWithArg{} );
+
+   /****
+    * Print the results
+    */
+   std::cout << " a = " << a << std::endl;
+   std::cout << " abs-max of all elements = " << result_total.first << " at position " << result_total.second << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Running example on the host system: " << std::endl;
+   reduceArrayExample< Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Running example on the CUDA device: " << std::endl;
+   reduceArrayExample< Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cu b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cu
new file mode 120000
index 000000000..d5721a03a
--- /dev/null
+++ b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cu
@@ -0,0 +1 @@
+reduceWithArgumentArrayExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Containers/CMakeLists.txt b/Documentation/Examples/Containers/CMakeLists.txt
index e85546a45..eb9f8d30a 100644
--- a/Documentation/Examples/Containers/CMakeLists.txt
+++ b/Documentation/Examples/Containers/CMakeLists.txt
@@ -1,10 +1,8 @@
 set( COMMON_EXAMPLES
          ArrayExample
          ArrayExample_forElements
-         ArrayExample_reduceElements
          ArrayViewExample
          ArrayViewExample_forElements
-         ArrayViewExample_reduceElements
          VectorExample
 )
 
diff --git a/Documentation/Examples/Containers/DistributedArrayExample.cu b/Documentation/Examples/Containers/DistributedArrayExample.cu
new file mode 120000
index 000000000..e4e614621
--- /dev/null
+++ b/Documentation/Examples/Containers/DistributedArrayExample.cu
@@ -0,0 +1 @@
+DistributedArrayExample.cpp
\ No newline at end of file
diff --git a/src/TNL/Algorithms/reduce.h b/src/TNL/Algorithms/reduce.h
index 2100ff827..96680b550 100644
--- a/src/TNL/Algorithms/reduce.h
+++ b/src/TNL/Algorithms/reduce.h
@@ -21,12 +21,15 @@ namespace TNL {
 namespace Algorithms {
 
 /**
- * \brief \e reduce implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
+ * \brief \e reduce implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern))
+ * for vectors and arrays.
  *
- * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
- * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
- * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
- * position of the smallest or the largest element, reduction with argument can be used.
+ * Reduction can be used for operations having one or more vectors (or arrays)
+ * elements as input and returning one number (or element) as output. Some
+ * examples of such operations can be vectors/arrays comparison, vector norm,
+ * scalar product of two vectors or computing minimum or maximum. If one also
+ * needs to know the position of the smallest or the largest element, the
+ * function \ref reduceWithArgument can be used.
  *
  * \tparam Device parameter says on what device the reduction is gonna be performed.
  * \tparam Index is a type for indexing.
@@ -34,7 +37,8 @@ namespace Algorithms {
  * \tparam Fetch is a lambda function for fetching the input data.
  * \tparam Reduction is a lambda function performing the reduction.
  *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
  *
  * \param begin defines range [begin, end) of indexes which will be used for the reduction.
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
@@ -83,17 +87,19 @@ Result reduce( const Index begin,
 }
 
 /**
- * \brief Variant of \ref TNL::Algorithms::reduce with functional instead of reduction lambda function.
+ * \brief Variant of \ref reduce with functional instead of reduction lambda function.
  *
  * \tparam Device parameter says on what device the reduction is gonna be performed.
  * \tparam Index is a type for indexing.
  * \tparam Fetch is a lambda function for fetching the input data.
  * \tparam Reduction is a functional performing the reduction.
  *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
  *
- * \e Reduction can be one of the following \ref TNL::Plus, \ref TNL::Multiplies, \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd,
- *    \ref TNL::LogicalOr, \ref TNL::BitAnd or \ref TNL::BitOr.
+ * \e Reduction can be one of the following \ref TNL::Plus, \ref TNL::Multiplies,
+ * \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd, \ref TNL::LogicalOr,
+ * \ref TNL::BitAnd or \ref TNL::BitOr. \ref TNL::Plus is used by default.
  *
  * \param begin defines range [begin, end) of indexes which will be used for the reduction.
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
@@ -133,10 +139,81 @@ auto reduce( const Index begin,
 }
 
 /**
- * \brief Variant of \ref TNL::Algorithms::reduce returning also a position of an element of interest.
+ * \brief Variant of \ref reduce for arrays, views and compatible objects.
  *
- * For example in case of computing minimal or maximal element in array/vector,
- * the position of the element having given value can be obtained. The use of this method
+ * The referenced \ref reduce function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and `zero` as the idempotent element of the reduction.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction,
+          typename Result >
+auto reduce( const Array& array,
+             Reduction&& reduction,
+             Result zero )
+{
+   return reduce< Device >( (typename Array::IndexType) 0,
+                            array.getSize(),
+                            array.getConstView(),
+                            std::forward< Reduction >( reduction ),
+                            zero );
+}
+
+/**
+ * \brief Variant of \ref reduce for arrays, views and compatible objects.
+ *
+ * \e Reduction can be one of the following \ref TNL::Plus, \ref TNL::Multiplies,
+ * \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd, \ref TNL::LogicalOr,
+ * \ref TNL::BitAnd or \ref TNL::BitOr. \ref TNL::Plus is used by default.
+ *
+ * The referenced \ref reduce function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and the idempotent element obtained from the reduction functional object.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction = TNL::Plus >
+auto reduce( const Array& array,
+             Reduction&& reduction = TNL::Plus{} )
+{
+   using ValueType = typename Array::ValueType;
+   return reduce< Array, Device >( array,
+                                   std::forward< Reduction >( reduction ),
+                                   reduction.template getIdempotent< ValueType >() );
+}
+
+/**
+ * \brief Variant of \ref reduce returning also the position of the element of interest.
+ *
+ * For example, when computing the minimal or maximal element of an array/vector,
+ * the position of the element having that value can be obtained. This function
  * is, however, more flexible.
  *
  * \tparam Device parameter says on what device the reduction is gonna be performed.
@@ -145,7 +222,8 @@ auto reduce( const Index begin,
  * \tparam Reduction is a lambda function performing the reduction.
  * \tparam Fetch is a lambda function for fetching the input data.
  *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
  *
  * \param begin defines range [begin, end) of indexes which will be used for the reduction.
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
@@ -196,11 +274,7 @@ reduceWithArgument( const Index begin,
 }
 
 /**
- * \brief Variant of \ref TNL::Algorithms::reduceWithArgument with functional instead of reduction lambda function.
- *
- * For example in case of computing minimal or maximal element in array/vector,
- * the position of the element having given value can be obtained. The use of this method
- * is, however, more flexible.
+ * \brief Variant of \ref reduceWithArgument with functional instead of reduction lambda function.
  *
  * \tparam Device parameter says on what device the reduction is gonna be performed.
  * \tparam Index is a type for indexing.
@@ -208,7 +282,8 @@ reduceWithArgument( const Index begin,
  * \tparam Reduction is a functional performing the reduction.
  * \tparam Fetch is a lambda function for fetching the input data.
  *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
  *
  * \e Reduction can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
  *
@@ -259,5 +334,74 @@ reduceWithArgument( const Index begin,
                                         reduction.template getIdempotent< Result >() );
 }
 
+/**
+ * \brief Variant of \ref reduceWithArgument for arrays, views and compatible objects.
+ *
+ * The referenced \ref reduceWithArgument function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and `zero` as the idempotent element of the reduction.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceWithArgumentArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceWithArgumentArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction,
+          typename Result >
+auto reduceWithArgument( const Array& array,
+                         Reduction&& reduction,
+                         Result zero )
+{
+   return reduceWithArgument< Device >( (typename Array::IndexType) 0,
+                                        array.getSize(),
+                                        array.getConstView(),
+                                        std::forward< Reduction >( reduction ),
+                                        zero );
+}
+
+/**
+ * \brief Variant of \ref reduceWithArgument for arrays, views and compatible objects.
+ *
+ * \e Reduction can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
+ *
+ * The referenced \ref reduceWithArgument function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and the idempotent element obtained from the reduction functional object.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceWithArgumentArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceWithArgumentArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction >
+auto reduceWithArgument( const Array& array,
+                         Reduction&& reduction )
+{
+   using ValueType = typename Array::ValueType;
+   return reduceWithArgument< Array, Device >( array,
+                                               std::forward< Reduction >( reduction ),
+                                               reduction.template getIdempotent< ValueType >() );
+}
+
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 3bbd5efb0..c33e1283e 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -722,154 +722,6 @@ class Array
       template< typename Function >
       void forAllElements( Function&& f ) const;
 
-       /**
-        * \brief Computes reduction with array elements on interval [ \e begin, \e end).
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-        * \param end defines range [begin, end) of indexes which will be used for the reduction.
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-       /**
-        * \brief Computes reduction with array elements on interval [ \e begin, \e end) for constant instances.
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-        * \param end defines range [begin, end) of indexes which will be used for the reduction.
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
-       /**
-        * \brief Computes reduction with all array elements.
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-       /**
-        * \brief Computes reduction with all array elements for constant instances.
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
       /**
        * \brief Checks if there is an element with value \e v.
        *
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index fc3e7193b..f6a25925f 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -756,62 +756,6 @@ forAllElements( Function&& f ) const
    view.forAllElements( f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-         typename Reduce,
-         typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   return this->getView().reduceElements( begin, end, fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-         typename Reduce,
-         typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   return this->getConstView().reduceElements( begin, end, fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   return this->getView().reduceEachElement( fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-         typename Reduce,
-         typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   return this->getConstView().reduceEachElement( fetch, reduce, zero );
-}
-
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 3716ce01e..46fc5c37a 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -541,154 +541,6 @@ public:
    template< typename Function >
    void forAllElements( Function&& f ) const;
 
-   /**
-    * \brief Computes reduction with array view elements on interval [ \e begin, \e end).
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-   /**
-    * \brief Computes reduction with array view elements on interval [ \e begin, \e end) for constant instances.
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
-   /**
-    * \brief Computes reduction with all array view elements.
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-   /**
-    * \brief Computes reduction with all array view elements for constant instances.
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
    /**
     * \brief Checks if there is an element with value \e v.
     *
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index fd9f95297..0e2bb7b77 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -396,68 +396,6 @@ forAllElements( Function&& f ) const
    this->forElements( 0, this->getSize(), f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   if( ! this->data )
-      return zero;
-
-   ValueType* d = this->getData();
-   auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   if( ! this->data )
-      return;
-
-   const ValueType* d = this->getData();
-   auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
-}
-
 template< typename Value,
           typename Device,
           typename Index >
diff --git a/src/UnitTests/Containers/VectorTest.h b/src/UnitTests/Containers/VectorTest.h
index 7c54ab578..ea1676b67 100644
--- a/src/UnitTests/Containers/VectorTest.h
+++ b/src/UnitTests/Containers/VectorTest.h
@@ -80,31 +80,6 @@ TYPED_TEST( VectorTest, constructors )
 
 }
 
-// test must be in a plain function because nvcc sucks (extended lambdas are
-// not allowed to be defined in protected class member functions)
-template< typename VectorType >
-void testVectorReduceElements()
-{
-   using IndexType = typename VectorType::IndexType;
-   using ValueType = typename VectorType::ValueType;
-
-   VectorType a( 10 );
-   a.forAllElements( [=] __cuda_callable__ ( IndexType i, ValueType& v ) mutable { v = 1; } );
-   auto fetch = [] __cuda_callable__ ( IndexType i, ValueType& v ) -> ValueType { return v; };
-   auto reduce = [] __cuda_callable__ ( const ValueType v1, const ValueType v2 ) { return v1 + v2; };
-   EXPECT_EQ( a.reduceEachElement( fetch, reduce, ( ValueType ) 0.0 ),
-              a.getSize() );
-
-   const VectorType b( a );
-   auto const_fetch = [] __cuda_callable__ ( IndexType i, const ValueType& v ) -> ValueType { return v; };
-   EXPECT_EQ( b.reduceEachElement( const_fetch, reduce, ( ValueType ) 0.0 ),
-              b.getSize() );
-}
-TYPED_TEST( VectorTest, reduceElements )
-{
-   testVectorReduceElements< typename TestFixture::VectorType >();
-}
-
 TEST( VectorSpecialCasesTest, defaultConstructors )
 {
    #ifdef HAVE_CUDA
-- 
GitLab


From 0d329226bfc5158bd91a1612dca1243968243eea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 23 Jul 2021 23:22:43 +0200
Subject: [PATCH 37/52] Reduction: renamed zero and idempotent to identity

---
 src/TNL/Algorithms/Multireduction.h           |  18 ++-
 src/TNL/Algorithms/Multireduction.hpp         |  20 +--
 src/TNL/Algorithms/SegmentedScan.h            |  21 +--
 src/TNL/Algorithms/SegmentedScan.hpp          |  14 +-
 .../detail/CudaMultireductionKernel.h         |  24 +--
 .../Algorithms/detail/CudaReductionKernel.h   |  60 ++++----
 src/TNL/Algorithms/detail/CudaScanKernel.h    |  98 +++++++------
 src/TNL/Algorithms/detail/DistributedScan.h   |   8 +-
 src/TNL/Algorithms/detail/Reduction.h         | 138 +-----------------
 src/TNL/Algorithms/detail/Reduction.hpp       |  48 +++---
 src/TNL/Algorithms/detail/Scan.h              |  18 +--
 src/TNL/Algorithms/detail/Scan.hpp            |  60 ++++----
 src/TNL/Algorithms/distributedScan.h          |  60 ++++----
 src/TNL/Algorithms/reduce.h                   |  47 +++---
 src/TNL/Algorithms/scan.h                     |  80 +++++-----
 .../Expressions/VerticalOperations.h          |  20 +--
 src/TNL/Functional.h                          |  20 +--
 src/TNL/Matrices/DenseMatrix.h                |  26 ++--
 src/TNL/Matrices/DenseMatrix.hpp              |  18 +--
 src/TNL/Matrices/DenseMatrixView.h            |  24 ++-
 src/TNL/Matrices/DenseMatrixView.hpp          |  20 +--
 src/TNL/Matrices/LambdaMatrix.h               |  12 +-
 src/TNL/Matrices/LambdaMatrix.hpp             |  12 +-
 src/TNL/Matrices/MultidiagonalMatrix.h        |  26 ++--
 src/TNL/Matrices/MultidiagonalMatrix.hpp      |  16 +-
 src/TNL/Matrices/MultidiagonalMatrixView.h    |  24 ++-
 src/TNL/Matrices/MultidiagonalMatrixView.hpp  |  18 +--
 src/TNL/Matrices/SparseMatrix.h               |  24 ++-
 src/TNL/Matrices/SparseMatrix.hpp             |  16 +-
 src/TNL/Matrices/SparseMatrixView.h           |  24 ++-
 src/TNL/Matrices/SparseMatrixView.hpp         |  20 +--
 src/TNL/Matrices/TridiagonalMatrix.h          |  24 ++-
 src/TNL/Matrices/TridiagonalMatrix.hpp        |  16 +-
 src/TNL/Matrices/TridiagonalMatrixView.h      |  24 ++-
 src/TNL/Matrices/TridiagonalMatrixView.hpp    |  18 +--
 src/UnitTests/Algorithms/SegmentedScanTest.h  |  12 +-
 36 files changed, 546 insertions(+), 582 deletions(-)

diff --git a/src/TNL/Algorithms/Multireduction.h b/src/TNL/Algorithms/Multireduction.h
index 8e63fa7ea..9e50afdb7 100644
--- a/src/TNL/Algorithms/Multireduction.h
+++ b/src/TNL/Algorithms/Multireduction.h
@@ -29,7 +29,9 @@ struct Multireduction< Devices::Sequential >
 {
    /**
     * Parameters:
-    *    zero: starting value for reduction
+    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *              for the reduction operation, i.e. element which does not
+    *              change the result of the reduction
     *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
     *                 the i-th value to be reduced from the j-th dataset
     *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -45,7 +47,7 @@ struct Multireduction< Devices::Sequential >
              typename Reduction,
              typename Index >
    static constexpr void
-   reduce( const Result zero,
+   reduce( const Result identity,
            DataFetcher dataFetcher,
            const Reduction reduction,
            const Index size,
@@ -58,7 +60,9 @@ struct Multireduction< Devices::Host >
 {
    /**
     * Parameters:
-    *    zero: starting value for reduction
+    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *              for the reduction operation, i.e. element which does not
+    *              change the result of the reduction
     *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
     *                 the i-th value to be reduced from the j-th dataset
     *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -74,7 +78,7 @@ struct Multireduction< Devices::Host >
              typename Reduction,
              typename Index >
    static void
-   reduce( const Result zero,
+   reduce( const Result identity,
            DataFetcher dataFetcher,
            const Reduction reduction,
            const Index size,
@@ -87,7 +91,9 @@ struct Multireduction< Devices::Cuda >
 {
    /**
     * Parameters:
-    *    zero: starting value for reduction
+    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *              for the reduction operation, i.e. element which does not
+    *              change the result of the reduction
     *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
     *                 the i-th value to be reduced from the j-th dataset
     *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -103,7 +109,7 @@ struct Multireduction< Devices::Cuda >
              typename Reduction,
              typename Index >
    static void
-   reduce( const Result zero,
+   reduce( const Result identity,
            DataFetcher dataFetcher,
            const Reduction reduction,
            const Index size,
diff --git a/src/TNL/Algorithms/Multireduction.hpp b/src/TNL/Algorithms/Multireduction.hpp
index 4eb8a9369..ca7eec8d2 100644
--- a/src/TNL/Algorithms/Multireduction.hpp
+++ b/src/TNL/Algorithms/Multireduction.hpp
@@ -35,7 +35,7 @@ template< typename Result,
           typename Index >
 void constexpr
 Multireduction< Devices::Sequential >::
-reduce( const Result zero,
+reduce( const Result identity,
         DataFetcher dataFetcher,
         const Reduction reduction,
         const Index size,
@@ -53,7 +53,7 @@ reduce( const Result zero,
       // (it is accessed as a row-major matrix with n rows and 4 columns)
       Result r[ n * 4 ];
       for( int k = 0; k < n * 4; k++ )
-         r[ k ] = zero;
+         r[ k ] = identity;
 
       // main reduction (explicitly unrolled loop)
       for( int b = 0; b < blocks; b++ ) {
@@ -89,7 +89,7 @@ reduce( const Result zero,
    }
    else {
       for( int k = 0; k < n; k++ )
-         result[ k ] = zero;
+         result[ k ] = identity;
 
       for( int b = 0; b < blocks; b++ ) {
          const Index offset = b * block_size;
@@ -112,7 +112,7 @@ template< typename Result,
           typename Index >
 void
 Multireduction< Devices::Host >::
-reduce( const Result zero,
+reduce( const Result identity,
         DataFetcher dataFetcher,
         const Reduction reduction,
         const Index size,
@@ -134,14 +134,14 @@ reduce( const Result zero,
          #pragma omp single nowait
          {
             for( int k = 0; k < n; k++ )
-               result[ k ] = zero;
+               result[ k ] = identity;
          }
 
          // initialize array for thread-local results
          // (it is accessed as a row-major matrix with n rows and 4 columns)
          Result r[ n * 4 ];
          for( int k = 0; k < n * 4; k++ )
-            r[ k ] = zero;
+            r[ k ] = identity;
 
          #pragma omp for nowait
          for( int b = 0; b < blocks; b++ ) {
@@ -185,7 +185,7 @@ reduce( const Result zero,
    }
    else
 #endif
-      Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result );
+      Multireduction< Devices::Sequential >::reduce( identity, dataFetcher, reduction, size, n, result );
 }
 
 template< typename Result,
@@ -194,7 +194,7 @@ template< typename Result,
           typename Index >
 void
 Multireduction< Devices::Cuda >::
-reduce( const Result zero,
+reduce( const Result identity,
         DataFetcher dataFetcher,
         const Reduction reduction,
         const Index size,
@@ -212,7 +212,7 @@ reduce( const Result zero,
 
    // start the reduction on the GPU
    Result* deviceAux1 = nullptr;
-   const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
+   const int reducedSize = detail::CudaMultireductionKernelLauncher( identity, dataFetcher, reduction, size, n, deviceAux1 );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
@@ -234,7 +234,7 @@ reduce( const Result zero,
 
    // finish the reduction on the host
    auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
-   Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
+   Multireduction< Devices::Sequential >::reduce( identity, dataFetcherFinish, reduction, reducedSize, n, hostResult );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
diff --git a/src/TNL/Algorithms/SegmentedScan.h b/src/TNL/Algorithms/SegmentedScan.h
index 10412b747..dbcf4260b 100644
--- a/src/TNL/Algorithms/SegmentedScan.h
+++ b/src/TNL/Algorithms/SegmentedScan.h
@@ -81,8 +81,9 @@ struct SegmentedScan< Devices::Sequential, Type >
     * \param begin the first element in the array to be scanned
     * \param end the last element in the array to be scanned
     * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
+    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *                 for the reduction operation, i.e. element which does not
+    *                 change the result of the reduction.
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
     *
@@ -107,7 +108,7 @@ struct SegmentedScan< Devices::Sequential, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::ValueType zero );
+            const typename Vector::ValueType identity );
 };
 
 template< detail::ScanType Type >
@@ -125,8 +126,9 @@ struct SegmentedScan< Devices::Host, Type >
     * \param begin the first element in the array to be scanned
     * \param end the last element in the array to be scanned
     * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
+    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *                 for the reduction operation, i.e. element which does not
+    *                 change the result of the reduction.
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
     *
@@ -151,7 +153,7 @@ struct SegmentedScan< Devices::Host, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::ValueType zero );
+            const typename Vector::ValueType identity );
 };
 
 template< detail::ScanType Type >
@@ -169,8 +171,9 @@ struct SegmentedScan< Devices::Cuda, Type >
     * \param begin the first element in the array to be scanned
     * \param end the last element in the array to be scanned
     * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
+    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *                 for the reduction operation, i.e. element which does not
+    *                 change the result of the reduction.
     *
     * The reduction lambda function takes two variables which are supposed to be reduced:
     *
@@ -197,7 +200,7 @@ struct SegmentedScan< Devices::Cuda, Type >
             const typename Vector::IndexType begin,
             const typename Vector::IndexType end,
             const Reduction& reduction,
-            const typename Vector::ValueType zero );
+            const typename Vector::ValueType identity );
 };
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/SegmentedScan.hpp b/src/TNL/Algorithms/SegmentedScan.hpp
index 18427a79d..27e5efe71 100644
--- a/src/TNL/Algorithms/SegmentedScan.hpp
+++ b/src/TNL/Algorithms/SegmentedScan.hpp
@@ -30,7 +30,7 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::ValueType zero )
+         const typename Vector::ValueType identity )
 {
    using ValueType = typename Vector::ValueType;
    using IndexType = typename Vector::IndexType;
@@ -44,12 +44,12 @@ perform( Vector& v,
    else // Exclusive scan
    {
       ValueType aux( v[ begin ] );
-      v[ begin ] = zero;
+      v[ begin ] = identity;
       for( IndexType i = begin + 1; i < end; i++ )
       {
          ValueType x = v[ i ];
          if( flags[ i ] )
-            aux = zero;
+            aux = identity;
          v[ i ] = aux;
          aux = reduction( aux, x );
       }
@@ -67,13 +67,13 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::ValueType zero )
+         const typename Vector::ValueType identity )
 {
 #ifdef HAVE_OPENMP
    // TODO: parallelize with OpenMP
-   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
+   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
 #else
-   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
+   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
 #endif
 }
 
@@ -88,7 +88,7 @@ perform( Vector& v,
          const typename Vector::IndexType begin,
          const typename Vector::IndexType end,
          const Reduction& reduction,
-         const typename Vector::ValueType zero )
+         const typename Vector::ValueType identity )
 {
 #ifdef HAVE_CUDA
    using ValueType = typename Vector::ValueType;
diff --git a/src/TNL/Algorithms/detail/CudaMultireductionKernel.h b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
index 973d8e958..c40d0602e 100644
--- a/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
@@ -47,7 +47,7 @@ template< int blockSizeX,
           typename Index >
 __global__ void
 __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
-CudaMultireductionKernel( const Result zero,
+CudaMultireductionKernel( const Result identity,
                           DataFetcher dataFetcher,
                           const Reduction reduction,
                           const Index size,
@@ -65,7 +65,7 @@ CudaMultireductionKernel( const Result zero,
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if( y >= n ) return;
 
-   sdata[ tid ] = zero;
+   sdata[ tid ] = identity;
 
    // Start with the sequential reduction and push the result into the shared memory.
    while( gid + 4 * gridSizeX < size ) {
@@ -145,7 +145,7 @@ template< typename Result,
           typename Reduction,
           typename Index >
 int
-CudaMultireductionKernelLauncher( const Result zero,
+CudaMultireductionKernelLauncher( const Result identity,
                                   DataFetcher dataFetcher,
                                   const Reduction reduction,
                                   const Index size,
@@ -217,55 +217,55 @@ CudaMultireductionKernelLauncher( const Result zero,
    {
       case 512:
          CudaMultireductionKernel< 512 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case 256:
          cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel< 256 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case 128:
          cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel< 128 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case  64:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<  64 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case  32:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<  32 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case  16:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<  16 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
      case   8:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<   8 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case   4:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<   4 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
       case   2:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<   2 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case   1:
          throw std::logic_error( "blockSize should not be 1." );
diff --git a/src/TNL/Algorithms/detail/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
index 2c5c0ddf1..60100c628 100644
--- a/src/TNL/Algorithms/detail/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -431,7 +431,7 @@ struct CudaReductionKernelLauncher
              typename Reduction >
    int start( const Reduction& reduction,
               DataFetcher& dataFetcher,
-              const Result& zero,
+              const Result& identity,
               Result*& output )
    {
       // create reference to the reduction buffer singleton and set size
@@ -440,7 +440,7 @@ struct CudaReductionKernelLauncher
       cudaReductionBuffer.setSize( buf_size );
       output = cudaReductionBuffer.template getData< Result >();
 
-      this->reducedSize = this->launch( begin, end, reduction, dataFetcher, zero, output );
+      this->reducedSize = this->launch( begin, end, reduction, dataFetcher, identity, output );
       return this->reducedSize;
    }
 
@@ -448,7 +448,7 @@ struct CudaReductionKernelLauncher
              typename Reduction >
    int startWithArgument( const Reduction& reduction,
                           DataFetcher& dataFetcher,
-                          const Result& zero,
+                          const Result& identity,
                           Result*& output,
                           Index*& idxOutput )
    {
@@ -459,14 +459,14 @@ struct CudaReductionKernelLauncher
       output = cudaReductionBuffer.template getData< Result >();
       idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] );
 
-      this->reducedSize = this->launchWithArgument( begin, end, reduction, dataFetcher, zero, output, idxOutput, nullptr );
+      this->reducedSize = this->launchWithArgument( begin, end, reduction, dataFetcher, identity, output, idxOutput, nullptr );
       return this->reducedSize;
    }
 
    template< typename Reduction >
    Result
    finish( const Reduction& reduction,
-           const Result& zero )
+           const Result& identity )
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
@@ -477,7 +477,7 @@ struct CudaReductionKernelLauncher
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
          auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
-         this->reducedSize = this->launch( 0, this->reducedSize, reduction, copyFetch, zero, output );
+         this->reducedSize = this->launch( 0, this->reducedSize, reduction, copyFetch, identity, output );
          std::swap( input, output );
       }
 
@@ -494,7 +494,7 @@ struct CudaReductionKernelLauncher
    template< typename Reduction >
    std::pair< Result, Index >
    finishWithArgument( const Reduction& reduction,
-                       const Result& zero )
+                       const Result& identity )
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
@@ -507,7 +507,7 @@ struct CudaReductionKernelLauncher
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
          auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
-         this->reducedSize = this->launchWithArgument( ( Index ) 0, this->reducedSize, reduction, copyFetch, zero, output, idxOutput, idxInput );
+         this->reducedSize = this->launchWithArgument( ( Index ) 0, this->reducedSize, reduction, copyFetch, identity, output, idxOutput, idxInput );
          std::swap( input, output );
          std::swap( idxInput, idxOutput );
       }
@@ -533,7 +533,7 @@ struct CudaReductionKernelLauncher
                   const Index end,
                   const Reduction& reduction,
                   DataFetcher& dataFetcher,
-                  const Result& zero,
+                  const Result& identity,
                   Result* output )
       {
 #ifdef HAVE_CUDA
@@ -550,55 +550,55 @@ struct CudaReductionKernelLauncher
          {
             case 512:
                CudaReductionKernel< 512 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case 256:
                cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel< 256 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case 128:
                cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel< 128 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case  64:
                cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  64 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case  32:
                cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  32 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case  16:
                cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  16 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
            case   8:
                cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   8 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case   4:
                cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   4 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case   2:
                cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   2 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
                break;
             case   1:
                TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -615,7 +615,7 @@ struct CudaReductionKernelLauncher
 
             // shared memory is allocated statically inside the kernel
             CudaReductionKernel< Reduction_maxThreadsPerBlock >
-            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output);
+            <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, begin, end, output);
             cudaStreamSynchronize(0);
             TNL_CHECK_CUDA_DEVICE;
          }
@@ -636,7 +636,7 @@ struct CudaReductionKernelLauncher
                               const Index end,
                               const Reduction& reduction,
                               DataFetcher& dataFetcher,
-                              const Result& zero,
+                              const Result& identity,
                               Result* output,
                               Index* idxOutput,
                               const Index* idxInput )
@@ -655,55 +655,55 @@ struct CudaReductionKernelLauncher
          {
             case 512:
                CudaReductionWithArgumentKernel< 512 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case 256:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel< 256 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case 128:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel< 128 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case  64:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  64 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case  32:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  32 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case  16:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  16 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
            case   8:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   8 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case   4:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   4 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case   2:
                cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   2 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
                break;
             case   1:
                TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -720,7 +720,7 @@ struct CudaReductionKernelLauncher
 
             // shared memory is allocated statically inside the kernel
             CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
-            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
+            <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
             cudaStreamSynchronize(0);
             TNL_CHECK_CUDA_DEVICE;
          }
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 41f304d46..eee63717b 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -47,8 +47,8 @@ struct CudaBlockScan
     * result of the scan according to its ID.
     *
     * \param reduction    The binary reduction functor.
-    * \param zero         Neutral element for given reduction operation, i.e.
-    *                     value such that `reduction(zero, x) == x` for any `x`.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
     * \param threadValue  Value of the calling thread to be reduced.
     * \param tid          Index of the calling thread (usually `threadIdx.x`,
     *                     unless you know what you are doing).
@@ -58,7 +58,7 @@ struct CudaBlockScan
    __device__ static
    ValueType
    scan( const Reduction& reduction,
-         ValueType zero,
+         ValueType identity,
          ValueType threadValue,
          int tid,
          Storage& storage )
@@ -108,7 +108,7 @@ struct CudaBlockScan
       if( scanType == ScanType::Exclusive ) {
          storage.chunkResults[ chunkResultIdx ] = threadValue;
          __syncthreads();
-         threadValue = (tid == 0) ? zero : storage.chunkResults[ Cuda::getInterleaving( tid - 1 ) ];
+         threadValue = (tid == 0) ? identity : storage.chunkResults[ Cuda::getInterleaving( tid - 1 ) ];
       }
 
       __syncthreads();
@@ -148,8 +148,8 @@ struct CudaTileScan
     *                     must be at least `end - begin` elements in the output
     *                     array starting at the position given by `outputBegin`.
     * \param reduction    The binary reduction functor.
-    * \param zero         Neutral element for given reduction operation, i.e.
-    *                     value such that `reduction(zero, x) == x` for any `x`.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
     * \param shift        A global shift to be applied to all elements in the
     *                     chunk processed by this thread.
     * \param storage      Auxiliary storage (must be allocated as a __shared__
@@ -165,7 +165,7 @@ struct CudaTileScan
          typename InputView::IndexType end,
          typename OutputView::IndexType outputBegin,
          const Reduction& reduction,
-         ValueType zero,
+         ValueType identity,
          ValueType shift,
          Storage& storage )
    {
@@ -194,11 +194,11 @@ struct CudaTileScan
             begin += blockDim.x;
             idx += blockDim.x;
          }
-         // fill the remaining (maxElementsInBlock - elementsInBlock) values with zero
+         // fill the remaining (maxElementsInBlock - elementsInBlock) values with identity
          // (this helps to avoid divergent branches in the blocks below)
          while( idx < maxElementsInBlock )
          {
-            storage.data[ idx ] = zero;
+            storage.data[ idx ] = identity;
             idx += blockDim.x;
          }
       }
@@ -212,7 +212,7 @@ struct CudaTileScan
          value = reduction( value, storage.data[ chunkOffset + i ] );
 
       // Scan the spine to obtain the initial value ("offset") for the downsweep.
-      value = BlockScan::scan( reduction, zero, value, threadIdx.x, storage.blockScanStorage );
+      value = BlockScan::scan( reduction, identity, value, threadIdx.x, storage.blockScanStorage );
 
       // Apply the global shift.
       value = reduction( value, shift );
@@ -258,7 +258,7 @@ CudaScanKernelUpsweep( const InputView input,
                        typename InputView::IndexType begin,
                        typename InputView::IndexType end,
                        Reduction reduction,
-                       ValueType zero,
+                       ValueType identity,
                        ValueType* reductionResults )
 {
    // verify the configuration
@@ -293,11 +293,11 @@ CudaScanKernelUpsweep( const InputView input,
          begin += blockDim.x;
          idx += blockDim.x;
       }
-      // fill the remaining (maxElementsInBlock - elementsInBlock) values with zero
+      // fill the remaining (maxElementsInBlock - elementsInBlock) values with identity
       // (this helps to avoid divergent branches in the blocks below)
       while( idx < maxElementsInBlock )
       {
-         storage.data[ idx ] = zero;
+         storage.data[ idx ] = identity;
          idx += blockDim.x;
       }
    }
@@ -335,7 +335,7 @@ CudaScanKernelDownsweep( const InputView input,
                          typename InputView::IndexType end,
                          typename OutputView::IndexType outputBegin,
                          Reduction reduction,
-                         typename OutputView::ValueType zero,
+                         typename OutputView::ValueType identity,
                          typename OutputView::ValueType shift,
                          const typename OutputView::ValueType* reductionResults )
 {
@@ -349,7 +349,7 @@ CudaScanKernelDownsweep( const InputView input,
    shift = reduction( shift, reductionResults[ blockIdx.x ] );
 
    // scan from input into output
-   TileScan::scan( input, output, begin, end, outputBegin, reduction, zero, shift, storage );
+   TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, shift, storage );
 }
 
 /* CudaScanKernelParallel - scan each tile of the input separately in each CUDA
@@ -369,7 +369,7 @@ CudaScanKernelParallel( const InputView input,
                         typename InputView::IndexType end,
                         typename OutputView::IndexType outputBegin,
                         Reduction reduction,
-                        typename OutputView::ValueType zero,
+                        typename OutputView::ValueType identity,
                         typename OutputView::ValueType* blockResults )
 {
    using ValueType = typename OutputView::ValueType;
@@ -379,7 +379,7 @@ CudaScanKernelParallel( const InputView input,
    __shared__ typename TileScan::Storage storage;
 
    // scan from input into output
-   const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, zero, zero, storage );
+   const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, identity, storage );
 
    // The last thread of the block stores the block result in the global memory.
    if( blockResults && threadIdx.x == blockDim.x - 1 )
@@ -454,10 +454,10 @@ struct CudaScanKernelLauncher
     * \param outputBegin the first element in the output array to be written. There
     *                    must be at least `end - begin` elements in the output
     *                    array starting at the position given by `outputBegin`.
-    * \param reduction  Symmetric binary function representing the reduction operation
-    *                   (usually addition, i.e. an instance of \ref std::plus).
-    * \param zero  Neutral element for given reduction operation, i.e. value such that
-    *              `reduction(zero, x) == x` for any `x`.
+    * \param reduction Symmetric binary function representing the reduction operation
+    *                  (usually addition, i.e. an instance of \ref std::plus).
+    * \param identity Neutral element for given reduction operation, i.e.
+    *                 value such that `reduction(identity, x) == x` for any `x`.
     */
    template< typename InputArray,
              typename OutputArray,
@@ -469,7 +469,7 @@ struct CudaScanKernelLauncher
             typename InputArray::IndexType end,
             typename OutputArray::IndexType outputBegin,
             Reduction&& reduction,
-            typename OutputArray::ValueType zero )
+            typename OutputArray::ValueType identity )
    {
       const auto blockShifts = performFirstPhase(
          input,
@@ -478,7 +478,7 @@ struct CudaScanKernelLauncher
          end,
          outputBegin,
          reduction,
-         zero );
+         identity );
 
       // if the first-phase kernel was launched with just one block, skip the second phase
       if( blockShifts.getSize() <= 2 )
@@ -492,8 +492,8 @@ struct CudaScanKernelLauncher
          end,
          outputBegin,
          reduction,
-         zero,
-         zero );
+         identity,
+         identity );
    }
 
    /****
@@ -506,10 +506,10 @@ struct CudaScanKernelLauncher
     * \param outputBegin the first element in the output array to be written. There
     *                    must be at least `end - begin` elements in the output
     *                    array starting at the position given by `outputBegin`.
-    * \param reduction  Symmetric binary function representing the reduction operation
-    *                   (usually addition, i.e. an instance of \ref std::plus).
-    * \param zero  Neutral value for given reduction operation, i.e. value such that
-    *              `reduction(zero, x) == x` for any `x`.
+    * \param reduction Symmetric binary function representing the reduction operation
+    *                  (usually addition, i.e. an instance of \ref std::plus).
+    * \param identity Neutral element for given reduction operation, i.e.
+    *                 value such that `reduction(identity, x) == x` for any `x`.
     */
    template< typename InputArray,
              typename OutputArray,
@@ -521,7 +521,7 @@ struct CudaScanKernelLauncher
                       typename InputArray::IndexType end,
                       typename OutputArray::IndexType outputBegin,
                       Reduction&& reduction,
-                      typename OutputArray::ValueType zero )
+                      typename OutputArray::ValueType identity )
    {
       static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
       using Index = typename InputArray::IndexType;
@@ -530,7 +530,7 @@ struct CudaScanKernelLauncher
          // allocate array for the block results
          Containers::Array< typename OutputArray::ValueType, Devices::Cuda > blockResults;
          blockResults.setSize( 2 );
-         blockResults.setElement( 0, zero );
+         blockResults.setElement( 0, identity );
 
          // run the kernel with just 1 block
          if( end - begin <= blockSize )
@@ -541,8 +541,8 @@ struct CudaScanKernelLauncher
                  end,
                  outputBegin,
                  reduction,
-                 zero,
-                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
                  &blockResults.getData()[ 1 ] );
          else if( end - begin <= blockSize * 3 )
             CudaScanKernelParallel< scanType, blockSize, 3 ><<< 1, blockSize >>>
@@ -552,8 +552,8 @@ struct CudaScanKernelLauncher
                  end,
                  outputBegin,
                  reduction,
-                 zero,
-                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
                  &blockResults.getData()[ 1 ] );
          else if( end - begin <= blockSize * 5 )
             CudaScanKernelParallel< scanType, blockSize, 5 ><<< 1, blockSize >>>
@@ -563,8 +563,8 @@ struct CudaScanKernelLauncher
                  end,
                  outputBegin,
                  reduction,
-                 zero,
-                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
                  &blockResults.getData()[ 1 ] );
          else
             CudaScanKernelParallel< scanType, blockSize, valuesPerThread ><<< 1, blockSize >>>
@@ -574,8 +574,8 @@ struct CudaScanKernelLauncher
                  end,
                  outputBegin,
                  reduction,
-                 zero,
-                 // blockResults are shifted by 1, because the 0-th element should stay zero
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
                  &blockResults.getData()[ 1 ] );
 
          // synchronize the null-stream
@@ -621,7 +621,7 @@ struct CudaScanKernelLauncher
                        begin + gridOffset + currentSize,
                        outputBegin + gridOffset,
                        reduction,
-                       zero,
+                       identity,
                        &blockResults.getData()[ gridIdx * maxGridSize() ] );
                   break;
 
@@ -631,7 +631,7 @@ struct CudaScanKernelLauncher
                        begin + gridOffset,
                        begin + gridOffset + currentSize,
                        reduction,
-                       zero,
+                       identity,
                        &blockResults.getData()[ gridIdx * maxGridSize() ] );
                   break;
             }
@@ -650,7 +650,7 @@ struct CudaScanKernelLauncher
             blockResults.getSize(),
             0,
             reduction,
-            zero );
+            identity );
 
          // Store the number of CUDA grids for the purpose of unit testing, i.e.
          // to check if we test the algorithm with more than one CUDA grid.
@@ -673,10 +673,12 @@ struct CudaScanKernelLauncher
     * \param outputBegin the first element in the output array to be written. There
     *                    must be at least `end - begin` elements in the output
     *                    array starting at the position given by `outputBegin`.
-    * \param reduction  Symmetric binary function representing the reduction operation
-    *                   (usually addition, i.e. an instance of \ref std::plus).
-    * \param shift  A constant shifting all elements of the array (usually `zero`, i.e.
-    *               the neutral value).
+    * \param reduction Symmetric binary function representing the reduction operation
+    *                  (usually addition, i.e. an instance of \ref std::plus).
+    * \param identity Neutral element for given reduction operation, i.e.
+    *                 value such that `reduction(identity, x) == x` for any `x`.
+    * \param shift A constant shifting all elements of the array (usually
+    *              `identity`, i.e. the neutral value).
     */
    template< typename InputArray,
              typename OutputArray,
@@ -690,7 +692,7 @@ struct CudaScanKernelLauncher
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType identity,
                        typename OutputArray::ValueType shift )
    {
       static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
@@ -745,7 +747,7 @@ struct CudaScanKernelLauncher
                        begin + gridOffset + currentSize,
                        outputBegin + gridOffset,
                        reduction,
-                       zero,
+                       identity,
                        shift,
                        &blockShifts.getData()[ gridIdx * maxGridSize() ] );
                   break;
diff --git a/src/TNL/Algorithms/detail/DistributedScan.h b/src/TNL/Algorithms/detail/DistributedScan.h
index 25900c12d..933056d92 100644
--- a/src/TNL/Algorithms/detail/DistributedScan.h
+++ b/src/TNL/Algorithms/detail/DistributedScan.h
@@ -33,7 +33,7 @@ struct DistributedScan
             typename InputDistributedArray::IndexType begin,
             typename InputDistributedArray::IndexType end,
             Reduction&& reduction,
-            typename OutputDistributedArray::ValueType zero )
+            typename OutputDistributedArray::ValueType identity )
    {
       using ValueType = typename OutputDistributedArray::ValueType;
       using DeviceType = typename OutputDistributedArray::DeviceType;
@@ -48,7 +48,7 @@ struct DistributedScan
          // perform first phase on the local data
          const auto inputLocalView = input.getConstLocalView();
          auto outputLocalView = output.getLocalView();
-         const auto block_results = Scan< DeviceType, Type, PhaseType >::performFirstPhase( inputLocalView, outputLocalView, begin, end, begin, reduction, zero );
+         const auto block_results = Scan< DeviceType, Type, PhaseType >::performFirstPhase( inputLocalView, outputLocalView, begin, end, begin, reduction, identity );
          const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );
 
          // exchange local results between ranks
@@ -60,11 +60,11 @@ struct DistributedScan
          MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );
 
          // compute the scan of the per-rank results
-         Scan< Devices::Host, ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform( rank_results, rank_results, 0, nproc, 0, reduction, zero );
+         Scan< Devices::Host, ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform( rank_results, rank_results, 0, nproc, 0, reduction, identity );
 
          // perform the second phase, using the per-block and per-rank results
          const int rank = MPI::GetRank( group );
-         Scan< DeviceType, Type, PhaseType >::performSecondPhase( inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, zero, rank_results[ rank ] );
+         Scan< DeviceType, Type, PhaseType >::performSecondPhase( inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, identity, rank_results[ rank ] );
       }
    }
 };
diff --git a/src/TNL/Algorithms/detail/Reduction.h b/src/TNL/Algorithms/detail/Reduction.h
index ca195077a..e06ad4bee 100644
--- a/src/TNL/Algorithms/detail/Reduction.h
+++ b/src/TNL/Algorithms/detail/Reduction.h
@@ -22,41 +22,12 @@ namespace TNL {
    namespace Algorithms {
       namespace detail {
 
-/**
- * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
- *
- * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
- * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
- * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
- * position of the smallest or the largest element, reduction with argument can be used.
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- *
- * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >.
- */
 template< typename Device >
 struct Reduction;
 
 template<>
 struct Reduction< Devices::Sequential >
 {
-   /**
-    * \brief Computes reduction on CPU sequentially.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -66,29 +37,8 @@ struct Reduction< Devices::Sequential >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& identity );
 
-   /**
-    * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -98,29 +48,12 @@ struct Reduction< Devices::Sequential >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& identity );
 };
 
 template<>
 struct Reduction< Devices::Host >
 {
-   /**
-    * \brief Computes reduction on CPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -130,29 +63,8 @@ struct Reduction< Devices::Host >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& identity );
 
-   /**
-    * \brief Computes reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam ReductionOperation is a lambda function performing the reduction.
-    * \tparam DataFetcher is a lambda function for fetching the input data.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -162,28 +74,12 @@ struct Reduction< Devices::Host >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& identity );
 };
 
 template<>
 struct Reduction< Devices::Cuda >
 {
-   /**
-    * \brief Computes reduction on GPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -193,30 +89,8 @@ struct Reduction< Devices::Cuda >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& identity );
 
-   /**
-    * \brief Computes reduction on GPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -226,7 +100,7 @@ struct Reduction< Devices::Cuda >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& identity );
 };
 
       } // namespace detail
diff --git a/src/TNL/Algorithms/detail/Reduction.hpp b/src/TNL/Algorithms/detail/Reduction.hpp
index 945d6baa1..abd6c63f5 100644
--- a/src/TNL/Algorithms/detail/Reduction.hpp
+++ b/src/TNL/Algorithms/detail/Reduction.hpp
@@ -46,7 +46,7 @@ reduce( const Index begin,
         const Index end,
         Fetch&& fetch,
         Reduce&& reduce,
-        const Result& zero )
+        const Result& identity )
 {
    constexpr int block_size = 128;
    const Index size = end - begin;
@@ -54,7 +54,7 @@ reduce( const Index begin,
 
    if( blocks > 1 ) {
       // initialize array for unrolled results
-      Result r[ 4 ] = { zero, zero, zero, zero };
+      Result r[ 4 ] = { identity, identity, identity, identity };
 
       // main reduce (explicitly unrolled loop)
       for( Index b = 0; b < blocks; b++ ) {
@@ -78,7 +78,7 @@ reduce( const Index begin,
       return r[ 0 ];
    }
    else {
-      Result result = zero;
+      Result result = identity;
       for( Index i = begin; i < end; i++ )
          result = reduce( result, fetch( i ) );
       return result;
@@ -95,7 +95,7 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduce&& reduce,
-                    const Result& zero )
+                    const Result& identity )
 {
    constexpr int block_size = 128;
    const Index size = end - begin;
@@ -104,7 +104,7 @@ reduceWithArgument( const Index begin,
    if( blocks > 1 ) {
       // initialize array for unrolled results
       Index arg[ 4 ] = { 0, 0, 0, 0 };
-      Result r[ 4 ] = { zero, zero, zero, zero };
+      Result r[ 4 ] = { identity, identity, identity, identity };
       bool initialized( false );
 
       // main reduce (explicitly unrolled loop)
@@ -143,7 +143,7 @@ reduceWithArgument( const Index begin,
    }
    else if( begin >= end ) {
       // trivial case, fetch should not be called in this case
-      return std::make_pair( zero, end );
+      return std::make_pair( identity, end );
    }
    else {
       std::pair< Result, Index > result( fetch( begin ), begin );
@@ -163,7 +163,7 @@ reduce( const Index begin,
         const Index end,
         Fetch&& fetch,
         Reduce&& reduce,
-        const Result& zero )
+        const Result& identity )
 {
 #ifdef HAVE_OPENMP
    constexpr int block_size = 128;
@@ -172,12 +172,12 @@ reduce( const Index begin,
 
    if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
       // global result variable
-      Result result = zero;
+      Result result = identity;
       const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
 #pragma omp parallel num_threads(threads)
       {
          // initialize array for thread-local results
-         Result r[ 4 ] = { zero, zero, zero, zero  };
+         Result r[ 4 ] = { identity, identity, identity, identity  };
 
          #pragma omp for nowait
          for( Index b = 0; b < blocks; b++ ) {
@@ -212,7 +212,7 @@ reduce( const Index begin,
    }
    else
 #endif
-      return Reduction< Devices::Sequential >::reduce( begin, end, fetch, reduce, zero );
+      return Reduction< Devices::Sequential >::reduce( begin, end, fetch, reduce, identity );
 }
 
 template< typename Index,
@@ -225,7 +225,7 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduce&& reduce,
-                    const Result& zero )
+                    const Result& identity )
 {
 #ifdef HAVE_OPENMP
    constexpr int block_size = 128;
@@ -234,13 +234,13 @@ reduceWithArgument( const Index begin,
 
    if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
       // global result variable
-      std::pair< Result, Index > result( zero, -1 );
+      std::pair< Result, Index > result( identity, -1 );
       const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
 #pragma omp parallel num_threads(threads)
       {
          // initialize array for thread-local results
          Index arg[ 4 ] = { 0, 0, 0, 0 };
-         Result r[ 4 ] = { zero, zero, zero, zero  };
+         Result r[ 4 ] = { identity, identity, identity, identity  };
          bool initialized( false );
 
          #pragma omp for nowait
@@ -290,7 +290,7 @@ reduceWithArgument( const Index begin,
    }
    else
 #endif
-      return Reduction< Devices::Sequential >::reduceWithArgument( begin, end, fetch, reduce, zero );
+      return Reduction< Devices::Sequential >::reduceWithArgument( begin, end, fetch, reduce, identity );
 }
 
 template< typename Index,
@@ -303,11 +303,11 @@ reduce( const Index begin,
         const Index end,
         Fetch&& fetch,
         Reduce&& reduce,
-        const Result& zero )
+        const Result& identity )
 {
    // trivial case, nothing to reduce
    if( begin >= end )
-      return zero;
+      return identity;
 
    // Only fundamental and pointer types can be safely reduced on host. Complex
    // objects stored on the device might contain pointers into the device memory,
@@ -327,7 +327,7 @@ reduce( const Index begin,
    const int reducedSize = reductionLauncher.start(
       reduce,
       fetch,
-      zero,
+      identity,
       deviceAux1 );
 
    #ifdef CUDA_REDUCTION_PROFILING
@@ -364,7 +364,7 @@ reduce( const Index begin,
 
       // finish the reduce on the host
       auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
-      const Result result = Reduction< Devices::Sequential >::reduce( 0, reducedSize, fetch, reduce, zero );
+      const Result result = Reduction< Devices::Sequential >::reduce( 0, reducedSize, fetch, reduce, identity );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -374,7 +374,7 @@ reduce( const Index begin,
    }
    else {
       // data can't be safely reduced on host, so continue with the reduce on the GPU
-      auto result = reductionLauncher.finish( reduce, zero );
+      auto result = reductionLauncher.finish( reduce, identity );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -397,11 +397,11 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduce&& reduce,
-                    const Result& zero )
+                    const Result& identity )
 {
    // trivial case, nothing to reduce
    if( begin >= end )
-      return std::make_pair( zero, end );
+      return std::make_pair( identity, end );
 
    // Only fundamental and pointer types can be safely reduced on host. Complex
    // objects stored on the device might contain pointers into the device memory,
@@ -422,7 +422,7 @@ reduceWithArgument( const Index begin,
    const int reducedSize = reductionLauncher.startWithArgument(
       reduce,
       fetch,
-      zero,
+      identity,
       deviceAux1,
       deviceIndexes );
 
@@ -475,7 +475,7 @@ reduceWithArgument( const Index begin,
 
       // finish the reduce on the host
 //      auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
-//      const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduce, fetch, zero );
+//      const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduce, fetch, identity );
       for( Index i = 1; i < reducedSize; i++ )
          reduce( resultArray[ 0 ], resultArray[ i ], indexArray[ 0 ], indexArray[ i ]  );
 
@@ -487,7 +487,7 @@ reduceWithArgument( const Index begin,
    }
    else {
       // data can't be safely reduced on host, so continue with the reduce on the GPU
-      auto result = reductionLauncher.finishWithArgument( reduce, zero );
+      auto result = reductionLauncher.finishWithArgument( reduce, identity );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
diff --git a/src/TNL/Algorithms/detail/Scan.h b/src/TNL/Algorithms/detail/Scan.h
index c97a3f8fe..9a32452d9 100644
--- a/src/TNL/Algorithms/detail/Scan.h
+++ b/src/TNL/Algorithms/detail/Scan.h
@@ -38,7 +38,7 @@ struct Scan< Devices::Sequential, Type, PhaseType >
             typename InputArray::IndexType end,
             typename OutputArray::IndexType outputBegin,
             Reduction&& reduction,
-            typename OutputArray::ValueType zero );
+            typename OutputArray::ValueType identity );
 
    template< typename InputArray,
              typename OutputArray,
@@ -50,7 +50,7 @@ struct Scan< Devices::Sequential, Type, PhaseType >
                       typename InputArray::IndexType end,
                       typename OutputArray::IndexType outputBegin,
                       Reduction&& reduction,
-                      typename OutputArray::ValueType zero );
+                      typename OutputArray::ValueType identity );
 
    template< typename InputArray,
              typename OutputArray,
@@ -64,7 +64,7 @@ struct Scan< Devices::Sequential, Type, PhaseType >
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType identity,
                        typename OutputArray::ValueType shift );
 };
 
@@ -81,7 +81,7 @@ struct Scan< Devices::Host, Type, PhaseType >
             typename InputArray::IndexType end,
             typename OutputArray::IndexType outputBegin,
             Reduction&& reduction,
-            typename OutputArray::ValueType zero );
+            typename OutputArray::ValueType identity );
 
    template< typename InputArray,
              typename OutputArray,
@@ -93,7 +93,7 @@ struct Scan< Devices::Host, Type, PhaseType >
                       typename InputArray::IndexType end,
                       typename OutputArray::IndexType outputBegin,
                       Reduction&& reduction,
-                      typename OutputArray::ValueType zero );
+                      typename OutputArray::ValueType identity );
 
    template< typename InputArray,
              typename OutputArray,
@@ -107,7 +107,7 @@ struct Scan< Devices::Host, Type, PhaseType >
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType identity,
                        typename OutputArray::ValueType shift );
 };
 
@@ -124,7 +124,7 @@ struct Scan< Devices::Cuda, Type, PhaseType >
             typename InputArray::IndexType end,
             typename OutputArray::IndexType outputBegin,
             Reduction&& reduction,
-            typename OutputArray::ValueType zero );
+            typename OutputArray::ValueType identity );
 
    template< typename InputArray,
              typename OutputArray,
@@ -136,7 +136,7 @@ struct Scan< Devices::Cuda, Type, PhaseType >
                       typename InputArray::IndexType end,
                       typename OutputArray::IndexType outputBegin,
                       Reduction&& reduction,
-                      typename OutputArray::ValueType zero );
+                      typename OutputArray::ValueType identity );
 
    template< typename InputArray,
              typename OutputArray,
@@ -150,7 +150,7 @@ struct Scan< Devices::Cuda, Type, PhaseType >
                        typename InputArray::IndexType end,
                        typename OutputArray::IndexType outputBegin,
                        Reduction&& reduction,
-                       typename OutputArray::ValueType zero,
+                       typename OutputArray::ValueType identity,
                        typename OutputArray::ValueType shift );
 };
 
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
index 4329294da..383d2b5e2 100644
--- a/src/TNL/Algorithms/detail/Scan.hpp
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -39,12 +39,12 @@ perform( const InputArray& input,
          typename InputArray::IndexType end,
          typename OutputArray::IndexType outputBegin,
          Reduction&& reduction,
-         typename OutputArray::ValueType zero )
+         typename OutputArray::ValueType identity )
 {
    using ValueType = typename OutputArray::ValueType;
 
    // simple sequential algorithm - not split into phases
-   ValueType aux = zero;
+   ValueType aux = identity;
    if( Type == ScanType::Inclusive ) {
       for( ; begin < end; begin++, outputBegin++ )
          output[ outputBegin ] = aux = reduction( aux, input[ begin ] );
@@ -73,11 +73,11 @@ performFirstPhase( const InputArray& input,
                    typename InputArray::IndexType end,
                    typename OutputArray::IndexType outputBegin,
                    Reduction&& reduction,
-                   typename OutputArray::ValueType zero )
+                   typename OutputArray::ValueType identity )
 {
    if( end <= begin ) {
       Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 1 );
-      block_results.setValue( zero );
+      block_results.setValue( identity );
       return block_results;
    }
 
@@ -87,8 +87,8 @@ performFirstPhase( const InputArray& input,
       {
          // artificial second phase - pre-scan the block
          Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 2 );
-         block_results[ 0 ] = zero;
-         block_results[ 1 ] = perform( input, output, begin, end, outputBegin, reduction, zero );
+         block_results[ 0 ] = identity;
+         block_results[ 1 ] = perform( input, output, begin, end, outputBegin, reduction, identity );
          return block_results;
       }
 
@@ -96,8 +96,8 @@ performFirstPhase( const InputArray& input,
       {
          // artificial first phase - only reduce the block
          Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 2 );
-         block_results[ 0 ] = zero;
-         block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, input, reduction, zero );
+         block_results[ 0 ] = identity;
+         block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, input, reduction, identity );
          return block_results;
       }
    };
@@ -117,7 +117,7 @@ performSecondPhase( const InputArray& input,
                     typename InputArray::IndexType end,
                     typename OutputArray::IndexType outputBegin,
                     Reduction&& reduction,
-                    typename OutputArray::ValueType zero,
+                    typename OutputArray::ValueType identity,
                     typename OutputArray::ValueType shift )
 {
    switch( PhaseType )
@@ -153,7 +153,7 @@ perform( const InputArray& input,
          typename InputArray::IndexType end,
          typename OutputArray::IndexType outputBegin,
          Reduction&& reduction,
-         typename OutputArray::ValueType zero )
+         typename OutputArray::ValueType identity )
 {
 #ifdef HAVE_OPENMP
    using ValueType = typename OutputArray::ValueType;
@@ -184,14 +184,14 @@ perform( const InputArray& input,
             case ScanPhaseType::WriteInFirstPhase:
             {
                // step 1: pre-scan the block and save the result of the block reduction
-               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, zero );
+               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, identity );
 
                #pragma omp barrier
 
                // step 2: scan the block results
                #pragma omp single
                {
-                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
+                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, identity );
                }
 
                // step 3: uniform shift of the pre-scanned block
@@ -206,14 +206,14 @@ perform( const InputArray& input,
             case ScanPhaseType::WriteInSecondPhase:
             {
                // step 1: per-block reductions, write the result into the buffer
-               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
+               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, identity );
 
                #pragma omp barrier
 
                // step 2: scan the block results
                #pragma omp single
                {
-                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
+                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, identity );
                }
 
                // step 3: per-block scan using the block results as initial values
@@ -226,7 +226,7 @@ perform( const InputArray& input,
    }
    else
 #endif
-      Scan< Devices::Sequential, Type >::perform( input, output, begin, end, outputBegin, reduction, zero );
+      Scan< Devices::Sequential, Type >::perform( input, output, begin, end, outputBegin, reduction, identity );
 }
 
 template< ScanType Type, ScanPhaseType PhaseType >
@@ -241,7 +241,7 @@ performFirstPhase( const InputArray& input,
                    typename InputArray::IndexType end,
                    typename OutputArray::IndexType outputBegin,
                    Reduction&& reduction,
-                   typename OutputArray::ValueType zero )
+                   typename OutputArray::ValueType identity )
 {
 #ifdef HAVE_OPENMP
    using ValueType = typename OutputArray::ValueType;
@@ -249,7 +249,7 @@ performFirstPhase( const InputArray& input,
 
    if( end <= begin ) {
       Containers::Array< ValueType, Devices::Sequential > block_results( 1 );
-      block_results.setValue( zero );
+      block_results.setValue( identity );
       return block_results;
    }
 
@@ -275,28 +275,28 @@ performFirstPhase( const InputArray& input,
             case ScanPhaseType::WriteInFirstPhase:
             {
                // pre-scan the block, write the result of the block reduction into the buffer
-               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, zero );
+               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, identity );
                break;
             }
 
             case ScanPhaseType::WriteInSecondPhase:
             {
                // upsweep: per-block reductions, write the result into the buffer
-               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, zero );
+               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, identity );
                break;
             }
          }
       }
 
       // spine step: scan the block results
-      Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, zero );
+      Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, identity );
 
       // block_results now contains shift values for each block - to be used in the second phase
       return block_results;
    }
    else
 #endif
-      return Scan< Devices::Sequential, Type >::performFirstPhase( input, output, begin, end, outputBegin, reduction, zero );
+      return Scan< Devices::Sequential, Type >::performFirstPhase( input, output, begin, end, outputBegin, reduction, identity );
 }
 
 template< ScanType Type, ScanPhaseType PhaseType >
@@ -313,7 +313,7 @@ performSecondPhase( const InputArray& input,
                     typename InputArray::IndexType end,
                     typename OutputArray::IndexType outputBegin,
                     Reduction&& reduction,
-                    typename OutputArray::ValueType zero,
+                    typename OutputArray::ValueType identity,
                     typename OutputArray::ValueType shift )
 {
 #ifdef HAVE_OPENMP
@@ -362,7 +362,7 @@ performSecondPhase( const InputArray& input,
    }
    else
 #endif
-      Scan< Devices::Sequential, Type >::performSecondPhase( input, output, blockShifts, begin, end, outputBegin, reduction, zero, shift );
+      Scan< Devices::Sequential, Type >::performSecondPhase( input, output, blockShifts, begin, end, outputBegin, reduction, identity, shift );
 }
 
 template< ScanType Type, ScanPhaseType PhaseType >
@@ -377,7 +377,7 @@ perform( const InputArray& input,
          typename InputArray::IndexType end,
          typename OutputArray::IndexType outputBegin,
          Reduction&& reduction,
-         typename OutputArray::ValueType zero )
+         typename OutputArray::ValueType identity )
 {
 #ifdef HAVE_CUDA
    if( end <= begin )
@@ -390,7 +390,7 @@ perform( const InputArray& input,
       end,
       outputBegin,
       std::forward< Reduction >( reduction ),
-      zero );
+      identity );
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -408,12 +408,12 @@ performFirstPhase( const InputArray& input,
                    typename InputArray::IndexType end,
                    typename OutputArray::IndexType outputBegin,
                    Reduction&& reduction,
-                   typename OutputArray::ValueType zero )
+                   typename OutputArray::ValueType identity )
 {
 #ifdef HAVE_CUDA
    if( end <= begin ) {
       Containers::Array< typename OutputArray::ValueType, Devices::Cuda > block_results( 1 );
-      block_results.setValue( zero );
+      block_results.setValue( identity );
       return block_results;
    }
 
@@ -424,7 +424,7 @@ performFirstPhase( const InputArray& input,
       end,
       outputBegin,
       std::forward< Reduction >( reduction ),
-      zero );
+      identity );
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
@@ -444,7 +444,7 @@ performSecondPhase( const InputArray& input,
                     typename InputArray::IndexType end,
                     typename OutputArray::IndexType outputBegin,
                     Reduction&& reduction,
-                    typename OutputArray::ValueType zero,
+                    typename OutputArray::ValueType identity,
                     typename OutputArray::ValueType shift )
 {
 #ifdef HAVE_CUDA
@@ -459,7 +459,7 @@ performSecondPhase( const InputArray& input,
       end,
       outputBegin,
       std::forward< Reduction >( reduction ),
-      zero,
+      identity,
       shift );
 #else
    throw Exceptions::CudaSupportMissing();
diff --git a/src/TNL/Algorithms/distributedScan.h b/src/TNL/Algorithms/distributedScan.h
index 573e9ac7d..39724f10a 100644
--- a/src/TNL/Algorithms/distributedScan.h
+++ b/src/TNL/Algorithms/distributedScan.h
@@ -38,8 +38,9 @@ namespace Algorithms {
  * \param begin the first element in the array to be scanned
  * \param end the last element in the array to be scanned
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. the element which does
+ *                 not change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -56,7 +57,7 @@ distributedInclusiveScan( const InputDistributedArray& input,
                           typename InputDistributedArray::IndexType begin,
                           typename InputDistributedArray::IndexType end,
                           Reduction&& reduction,
-                          typename OutputDistributedArray::ValueType zero )
+                          typename OutputDistributedArray::ValueType identity )
 {
    static_assert( std::is_same< typename InputDistributedArray::DeviceType, typename OutputDistributedArray::DeviceType >::value,
                   "The input and output arrays must have the same device type." );
@@ -66,7 +67,7 @@ distributedInclusiveScan( const InputDistributedArray& input,
                   "The input and output arrays must have the same local range on all ranks." );
    // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
    using Scan = detail::DistributedScan< detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInFirstPhase >;
-   Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
    output.startSynchronization();
 }
 
@@ -74,7 +75,7 @@ distributedInclusiveScan( const InputDistributedArray& input,
  * \brief Overload of \ref distributedInclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename OutputDistributedArray::ValueType >()`.
+ * The identity element is taken as `reduction.template getIdentity< typename OutputDistributedArray::ValueType >()`.
  * See \ref distributedInclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
  */
@@ -90,8 +91,8 @@ distributedInclusiveScan( const InputDistributedArray& input,
 {
    if( end == 0 )
       end = input.getSize();
-   constexpr typename OutputDistributedArray::ValueType zero = Reduction::template getIdempotent< typename OutputDistributedArray::ValueType >();
-   distributedInclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+   constexpr typename OutputDistributedArray::ValueType identity = Reduction::template getIdentity< typename OutputDistributedArray::ValueType >();
+   distributedInclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
@@ -112,8 +113,9 @@ distributedInclusiveScan( const InputDistributedArray& input,
  * \param begin the first element in the array to be scanned
  * \param end the last element in the array to be scanned
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. the element which does
+ *                 not change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -130,7 +132,7 @@ distributedExclusiveScan( const InputDistributedArray& input,
                           typename InputDistributedArray::IndexType begin,
                           typename InputDistributedArray::IndexType end,
                           Reduction&& reduction,
-                          typename OutputDistributedArray::ValueType zero )
+                          typename OutputDistributedArray::ValueType identity )
 {
    static_assert( std::is_same< typename InputDistributedArray::DeviceType, typename OutputDistributedArray::DeviceType >::value,
                   "The input and output arrays must have the same device type." );
@@ -140,7 +142,7 @@ distributedExclusiveScan( const InputDistributedArray& input,
                   "The input and output arrays must have the same local range on all ranks." );
    // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
    using Scan = detail::DistributedScan< detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInFirstPhase >;
-   Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
    output.startSynchronization();
 }
 
@@ -148,7 +150,7 @@ distributedExclusiveScan( const InputDistributedArray& input,
  * \brief Overload of \ref distributedExclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename OutputDistributedArray::ValueType >()`.
+ * The identity element is taken as `reduction.template getIdentity< typename OutputDistributedArray::ValueType >()`.
  * See \ref distributedExclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
  */
@@ -164,8 +166,8 @@ distributedExclusiveScan( const InputDistributedArray& input,
 {
    if( end == 0 )
       end = input.getSize();
-   constexpr typename OutputDistributedArray::ValueType zero = Reduction::template getIdempotent< typename OutputDistributedArray::ValueType >();
-   distributedExclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), zero );
+   constexpr typename OutputDistributedArray::ValueType identity = Reduction::template getIdentity< typename OutputDistributedArray::ValueType >();
+   distributedExclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
@@ -186,8 +188,9 @@ distributedExclusiveScan( const InputDistributedArray& input,
  * \param begin the first element in the array to be scanned
  * \param end the last element in the array to be scanned
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. the element which does
+ *                 not change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -202,10 +205,10 @@ distributedInplaceInclusiveScan( DistributedArray& array,
                                  typename DistributedArray::IndexType begin,
                                  typename DistributedArray::IndexType end,
                                  Reduction&& reduction,
-                                 typename DistributedArray::ValueType zero )
+                                 typename DistributedArray::ValueType identity )
 {
    using Scan = detail::DistributedScan< detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInSecondPhase >;
-   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), identity );
    array.startSynchronization();
 }
 
@@ -213,7 +216,7 @@ distributedInplaceInclusiveScan( DistributedArray& array,
  * \brief Overload of \ref distributedInplaceInclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename DistributedArray::ValueType >()`.
+ * The identity element is taken as `reduction.template getIdentity< typename DistributedArray::ValueType >()`.
  * See \ref distributedInplaceInclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
  */
@@ -227,8 +230,8 @@ distributedInplaceInclusiveScan( DistributedArray& array,
 {
    if( end == 0 )
       end = array.getSize();
-   constexpr typename DistributedArray::ValueType zero = Reduction::template getIdempotent< typename DistributedArray::ValueType >();
-   distributedInplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   constexpr typename DistributedArray::ValueType identity = Reduction::template getIdentity< typename DistributedArray::ValueType >();
+   distributedInplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
@@ -249,8 +252,9 @@ distributedInplaceInclusiveScan( DistributedArray& array,
  * \param begin the first element in the array to be scanned
  * \param end the last element in the array to be scanned
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. the element which does
+ *                 not change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -265,10 +269,10 @@ distributedInplaceExclusiveScan( DistributedArray& array,
                                  typename DistributedArray::IndexType begin,
                                  typename DistributedArray::IndexType end,
                                  Reduction&& reduction,
-                                 typename DistributedArray::ValueType zero )
+                                 typename DistributedArray::ValueType identity )
 {
    using Scan = detail::DistributedScan< detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInSecondPhase >;
-   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), identity );
    array.startSynchronization();
 }
 
@@ -276,7 +280,7 @@ distributedInplaceExclusiveScan( DistributedArray& array,
  * \brief Overload of \ref distributedInplaceExclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename DistributedArray::ValueType >()`.
+ * The identity element is taken as `reduction.template getIdentity< typename DistributedArray::ValueType >()`.
  * See \ref distributedInplaceExclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
  */
@@ -290,8 +294,8 @@ distributedInplaceExclusiveScan( DistributedArray& array,
 {
    if( end == 0 )
       end = array.getSize();
-   constexpr typename DistributedArray::ValueType zero = Reduction::template getIdempotent< typename DistributedArray::ValueType >();
-   distributedInplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   constexpr typename DistributedArray::ValueType identity = Reduction::template getIdentity< typename DistributedArray::ValueType >();
+   distributedInplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
 }
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/reduce.h b/src/TNL/Algorithms/reduce.h
index 96680b550..a9eeed612 100644
--- a/src/TNL/Algorithms/reduce.h
+++ b/src/TNL/Algorithms/reduce.h
@@ -44,8 +44,9 @@ namespace Algorithms {
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
  * \param fetch is a lambda function fetching the input data.
  * \param reduction is a lambda function defining the reduction operation.
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. the element which does
+ *                 not change the result of the reduction.
  * \return result of the reduction
  *
  * The `fetch` lambda function takes one argument which is index of the element to be fetched:
@@ -77,13 +78,13 @@ Result reduce( const Index begin,
                const Index end,
                Fetch&& fetch,
                Reduction&& reduction,
-               const Result& zero )
+               const Result& identity )
 {
    return detail::Reduction< Device >::reduce( begin,
                                                end,
                                                std::forward< Fetch >( fetch ),
                                                std::forward< Reduction >( reduction ),
-                                               zero );
+                                               identity );
 }
 
 /**
@@ -135,7 +136,7 @@ auto reduce( const Index begin,
                             end,
                             std::forward< Fetch >( fetch ),
                             std::forward< Reduction >( reduction ),
-                            reduction.template getIdempotent< Result >() );
+                            reduction.template getIdentity< Result >() );
 }
 
 /**
@@ -148,7 +149,7 @@ auto reduce( const Index begin,
  * - `array.getSize()` as the end of the interval for reduction,
  * - `array.getConstView()` as the `fetch` functor,
  * - `reduction` as the reduction operation,
- * - and `zero` as the idempotent element of the reduction.
+ * - and `identity` as the identity element of the reduction.
  *
  * \par Example
  *
@@ -164,13 +165,13 @@ template< typename Array,
           typename Result >
 auto reduce( const Array& array,
              Reduction&& reduction,
-             Result zero )
+             Result identity )
 {
    return reduce< Device >( (typename Array::IndexType) 0,
                             array.getSize(),
                             array.getConstView(),
                             std::forward< Reduction >( reduction ),
-                            zero );
+                            identity );
 }
 
 /**
@@ -187,7 +188,7 @@ auto reduce( const Array& array,
  * - `array.getSize()` as the end of the interval for reduction,
  * - `array.getConstView()` as the `fetch` functor,
  * - `reduction` as the reduction operation,
- * - and the idempotent element obtained from the reduction functional object.
+ * - and the identity element obtained from the reduction functional object.
  *
  * \par Example
  *
@@ -206,7 +207,7 @@ auto reduce( const Array& array,
    using ValueType = typename Array::ValueType;
    return reduce< Array, Device >( array,
                                    std::forward< Reduction >( reduction ),
-                                   reduction.template getIdempotent< ValueType >() );
+                                   reduction.template getIdentity< ValueType >() );
 }
 
 /**
@@ -229,8 +230,9 @@ auto reduce( const Array& array,
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
  * \param fetch is a lambda function fetching the input data.
  * \param reduction is a lambda function defining the reduction operation and managing the elements positions.
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. the element which does
+ *                 not change the result of the reduction.
  * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first`
  *         is the element position and `pair.second` is the reduction result.
  *
@@ -264,13 +266,13 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduction&& reduction,
-                    const Result& zero )
+                    const Result& identity )
 {
    return detail::Reduction< Device >::reduceWithArgument( begin,
                                                            end,
                                                            std::forward< Fetch >( fetch ),
                                                            std::forward< Reduction >( reduction ),
-                                                           zero );
+                                                           identity );
 }
 
 /**
@@ -291,8 +293,9 @@ reduceWithArgument( const Index begin,
  * \param end defines range [begin, end) of indexes which will be used for the reduction.
  * \param fetch is a lambda function fetching the input data.
  * \param reduction is a lambda function defining the reduction operation and managing the elements positions.
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. the element which does
+ *                 not change the result of the reduction.
  * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first`
  *         is the element position and `pair.second` is the reduction result.
  *
@@ -331,7 +334,7 @@ reduceWithArgument( const Index begin,
                                         end,
                                         std::forward< Fetch >( fetch ),
                                         std::forward< Reduction >( reduction ),
-                                        reduction.template getIdempotent< Result >() );
+                                        reduction.template getIdentity< Result >() );
 }
 
 /**
@@ -344,7 +347,7 @@ reduceWithArgument( const Index begin,
  * - `array.getSize()` as the end of the interval for reduction,
  * - `array.getConstView()` as the `fetch` functor,
  * - `reduction` as the reduction operation,
- * - and `zero` as the idempotent element of the reduction.
+ * - and `identity` as the identity element of the reduction.
  *
  * \par Example
  *
@@ -360,13 +363,13 @@ template< typename Array,
           typename Result >
 auto reduceWithArgument( const Array& array,
                          Reduction&& reduction,
-                         Result zero )
+                         Result identity )
 {
    return reduceWithArgument< Device >( (typename Array::IndexType) 0,
                                         array.getSize(),
                                         array.getConstView(),
                                         std::forward< Reduction >( reduction ),
-                                        zero );
+                                        identity );
 }
 
 /**
@@ -381,7 +384,7 @@ auto reduceWithArgument( const Array& array,
  * - `array.getSize()` as the end of the interval for reduction,
  * - `array.getConstView()` as the `fetch` functor,
  * - `reduction` as the reduction operation,
- * - and the idempotent element obtained from the reduction functional object.
+ * - and the identity element obtained from the reduction functional object.
  *
  * \par Example
  *
@@ -400,7 +403,7 @@ auto reduceWithArgument( const Array& array,
    using ValueType = typename Array::ValueType;
    return reduceWithArgument< Array, Device >( array,
                                                std::forward< Reduction >( reduction ),
-                                               reduction.template getIdempotent< ValueType >() );
+                                               reduction.template getIdentity< ValueType >() );
 }
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/scan.h b/src/TNL/Algorithms/scan.h
index 982106afc..30eb5ddc2 100644
--- a/src/TNL/Algorithms/scan.h
+++ b/src/TNL/Algorithms/scan.h
@@ -44,8 +44,9 @@ namespace Algorithms {
  *                    must be at least `end - begin` elements in the output
  *                    array starting at the position given by `outputBegin`.
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e.
- *             element which does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -71,22 +72,23 @@ inclusiveScan( const InputArray& input,
                typename InputArray::IndexType end,
                typename OutputArray::IndexType outputBegin,
                Reduction&& reduction,
-               typename OutputArray::ValueType zero )
+               typename OutputArray::ValueType identity )
 {
    static_assert( std::is_same< typename InputArray::DeviceType, typename OutputArray::DeviceType >::value,
                   "The input and output arrays must have the same device type." );
-   TNL_ASSERT_EQ( reduction( zero, zero ), zero,
-                  "zero is not an idempotent value of the reduction operation" );
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
    // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
    using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInFirstPhase >;
-   Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
  * \brief Overload of \ref inclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename OutputArray::ValueType >()`.
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename OutputArray::ValueType >()`.
  * See \ref inclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
  */
@@ -103,8 +105,8 @@ inclusiveScan( const InputArray& input,
 {
    if( end == 0 )
       end = input.getSize();
-   constexpr typename OutputArray::ValueType zero = Reduction::template getIdempotent< typename OutputArray::ValueType >();
-   inclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+   constexpr typename OutputArray::ValueType identity = Reduction::template getIdentity< typename OutputArray::ValueType >();
+   inclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
@@ -131,8 +133,9 @@ inclusiveScan( const InputArray& input,
  *                    must be at least `end - begin` elements in the output
  *                    array starting at the position given by `outputBegin`.
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e.
- *             element which does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -158,22 +161,23 @@ exclusiveScan( const InputArray& input,
                typename InputArray::IndexType end,
                typename OutputArray::IndexType outputBegin,
                Reduction&& reduction,
-               typename OutputArray::ValueType zero )
+               typename OutputArray::ValueType identity )
 {
    static_assert( std::is_same< typename InputArray::DeviceType, typename OutputArray::DeviceType >::value,
                   "The input and output arrays must have the same device type." );
-   TNL_ASSERT_EQ( reduction( zero, zero ), zero,
-                  "zero is not an idempotent value of the reduction operation" );
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
    // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
    using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInFirstPhase >;
-   Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
  * \brief Overload of \ref exclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename OutputArray::ValueType >()`.
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename OutputArray::ValueType >()`.
  * See \ref exclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
  */
@@ -190,8 +194,8 @@ exclusiveScan( const InputArray& input,
 {
    if( end == 0 )
       end = input.getSize();
-   constexpr typename OutputArray::ValueType zero = Reduction::template getIdempotent< typename OutputArray::ValueType >();
-   exclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), zero );
+   constexpr typename OutputArray::ValueType identity = Reduction::template getIdentity< typename OutputArray::ValueType >();
+   exclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
@@ -212,8 +216,9 @@ exclusiveScan( const InputArray& input,
  * \param begin the first element in the array to be scanned
  * \param end the last element in the array to be scanned
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -236,19 +241,20 @@ inplaceInclusiveScan( Array& array,
                       typename Array::IndexType begin,
                       typename Array::IndexType end,
                       Reduction&& reduction,
-                      typename Array::ValueType zero )
+                      typename Array::ValueType identity )
 {
-   TNL_ASSERT_EQ( reduction( zero, zero ), zero,
-                  "zero is not an idempotent value of the reduction operation" );
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
    using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInSecondPhase >;
-   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
  * \brief Overload of \ref inplaceInclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename Array::ValueType >()`.
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename Array::ValueType >()`.
  * See \ref inplaceInclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
  */
@@ -262,8 +268,8 @@ inplaceInclusiveScan( Array& array,
 {
    if( end == 0 )
       end = array.getSize();
-   constexpr typename Array::ValueType zero = Reduction::template getIdempotent< typename Array::ValueType >();
-   inplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   constexpr typename Array::ValueType identity = Reduction::template getIdentity< typename Array::ValueType >();
+   inplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
@@ -284,8 +290,9 @@ inplaceInclusiveScan( Array& array,
  * \param begin the first element in the array to be scanned
  * \param end the last element in the array to be scanned
  * \param reduction functor implementing the reduction operation
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
  *
  * The reduction functor takes two variables to be reduced:
  *
@@ -308,19 +315,20 @@ inplaceExclusiveScan( Array& array,
                       typename Array::IndexType begin,
                       typename Array::IndexType end,
                       Reduction&& reduction,
-                      typename Array::ValueType zero )
+                      typename Array::ValueType identity )
 {
-   TNL_ASSERT_EQ( reduction( zero, zero ), zero,
-                  "zero is not an idempotent value of the reduction operation" );
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
    using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInSecondPhase >;
-   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), zero );
+   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), identity );
 }
 
 /**
  * \brief Overload of \ref inplaceExclusiveScan which uses a TNL functional
  *        object for reduction. \ref TNL::Plus is used by default.
  *
- * The idempotent value is taken as `reduction.template getIdempotent< typename Array::ValueType >()`.
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename Array::ValueType >()`.
  * See \ref inplaceExclusiveScan for the explanation of other parameters.
  * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
  */
@@ -334,8 +342,8 @@ inplaceExclusiveScan( Array& array,
 {
    if( end == 0 )
       end = array.getSize();
-   constexpr typename Array::ValueType zero = Reduction::template getIdempotent< typename Array::ValueType >();
-   inplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), zero );
+   constexpr typename Array::ValueType identity = Reduction::template getIdentity< typename Array::ValueType >();
+   inplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
 }
 
 } // namespace Algorithms
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
index 385096303..324448541 100644
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ b/src/TNL/Containers/Expressions/VerticalOperations.h
@@ -35,7 +35,7 @@ auto ExpressionMin( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Min{}, TNL::Min::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Min{}, TNL::Min::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -46,7 +46,7 @@ auto ExpressionArgMin( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MinWithArg{}, TNL::MinWithArg::template getIdempotent< ResultType >() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MinWithArg{}, TNL::MinWithArg::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -57,7 +57,7 @@ auto ExpressionMax( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Max{}, TNL::Max::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Max{}, TNL::Max::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -68,7 +68,7 @@ auto ExpressionArgMax( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MaxWithArg{}, TNL::MaxWithArg::template getIdempotent< ResultType >() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MaxWithArg{}, TNL::MaxWithArg::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -79,7 +79,7 @@ auto ExpressionSum( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Plus{}, TNL::Plus::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Plus{}, TNL::Plus::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -90,7 +90,7 @@ auto ExpressionProduct( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Multiplies{}, TNL::Multiplies::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Multiplies{}, TNL::Multiplies::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -101,7 +101,7 @@ auto ExpressionLogicalAnd( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalAnd{}, TNL::LogicalAnd::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalAnd{}, TNL::LogicalAnd::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -112,7 +112,7 @@ auto ExpressionLogicalOr( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalOr{}, TNL::LogicalOr::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalOr{}, TNL::LogicalOr::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -123,7 +123,7 @@ auto ExpressionBinaryAnd( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitAnd{}, TNL::BitAnd::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitAnd{}, TNL::BitAnd::template getIdentity< ResultType >() );
 }
 
 template< typename Expression >
@@ -134,7 +134,7 @@ auto ExpressionBinaryOr( const Expression& expression )
    using IndexType = typename Expression::IndexType;
 
    const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitOr{}, TNL::BitOr::template getIdempotent< ResultType >() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitOr{}, TNL::BitOr::template getIdentity< ResultType >() );
 }
 
 } // namespace Expressions
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index d683ff639..b1f897433 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -23,7 +23,7 @@ namespace TNL {
 struct Plus : public std::plus< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 0; }
+   static constexpr T getIdentity() { return 0; }
 };
 
 /**
@@ -32,7 +32,7 @@ struct Plus : public std::plus< void >
 struct Multiplies : public std::multiplies< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 1; }
+   static constexpr T getIdentity() { return 1; }
 };
 
 /**
@@ -41,7 +41,7 @@ struct Multiplies : public std::multiplies< void >
 struct Min
 {
    template< typename T >
-   static constexpr T getIdempotent()
+   static constexpr T getIdentity()
    {
       static_assert( std::numeric_limits< T >::is_specialized,
                      "std::numeric_limits is not specialized for the requested type" );
@@ -63,7 +63,7 @@ struct Min
 struct Max
 {
    template< typename T >
-   static constexpr T getIdempotent()
+   static constexpr T getIdentity()
    {
       static_assert( std::numeric_limits< T >::is_specialized,
                      "std::numeric_limits is not specialized for the requested type" );
@@ -85,7 +85,7 @@ struct Max
 struct MinWithArg
 {
    template< typename T >
-   static constexpr T getIdempotent()
+   static constexpr T getIdentity()
    {
       static_assert( std::numeric_limits< T >::is_specialized,
                      "std::numeric_limits is not specialized for the requested type" );
@@ -113,7 +113,7 @@ struct MinWithArg
 struct MaxWithArg
 {
    template< typename T >
-   static constexpr T getIdempotent()
+   static constexpr T getIdentity()
    {
       static_assert( std::numeric_limits< T >::is_specialized,
                      "std::numeric_limits is not specialized for the requested type" );
@@ -141,7 +141,7 @@ struct MaxWithArg
 struct LogicalAnd : public std::logical_and< void >
 {
    template< typename T >
-   static constexpr T getIdempotent()
+   static constexpr T getIdentity()
    {
       static_assert( std::numeric_limits< T >::is_specialized,
                      "std::numeric_limits is not specialized for the requested type" );
@@ -155,7 +155,7 @@ struct LogicalAnd : public std::logical_and< void >
 struct LogicalOr : public std::logical_or< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 0; }
+   static constexpr T getIdentity() { return 0; }
 };
 
 /**
@@ -164,7 +164,7 @@ struct LogicalOr : public std::logical_or< void >
 struct BitAnd : public std::bit_and< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return ~static_cast< T >( 0 ); }
+   static constexpr T getIdentity() { return ~static_cast< T >( 0 ); }
 };
 
 /**
@@ -173,7 +173,7 @@ struct BitAnd : public std::bit_and< void >
 struct BitOr : public std::bit_or< void >
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 0; }
+   static constexpr T getIdentity() { return 0; }
 };
 
 } // namespace TNL
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 8d6e5d771..a65c12d80 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -701,7 +701,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceRows.cpp
@@ -709,7 +711,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -728,7 +730,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceRows.cpp
@@ -736,7 +740,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
@@ -753,7 +757,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceAllRows.cpp
@@ -761,7 +767,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
@@ -778,7 +784,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceAllRows.cpp
@@ -786,7 +794,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Computes product of matrix and vector.
@@ -964,7 +972,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
 
 /**
  * \brief Insertion operator for dense matrix and output stream.
- * 
+ *
  * \param str is the output stream.
  * \param matrix is the dense matrix.
  * \return  reference to the stream.
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index d330b8333..e3a975167 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -355,9 +355,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -368,9 +368,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -381,9 +381,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -394,9 +394,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -1375,7 +1375,7 @@ template< typename Real,
           ElementsOrganization Organization,
           typename RealAllocator >
 std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, Organization, RealAllocator >& matrix )
-{ 
+{
    matrix.print( str );
    return str;
 }
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index 89ace2d06..ea7f6dbe7 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -418,7 +418,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceRows.cpp
@@ -426,7 +428,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -445,7 +447,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceRows.cpp
@@ -453,7 +457,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
@@ -470,7 +474,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceAllRows.cpp
@@ -478,7 +484,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
@@ -495,7 +501,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceAllRows.cpp
@@ -503,7 +511,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 4a999d76b..6ad36f27a 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -290,14 +290,14 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
    auto values_view = this->values.getView();
    auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -307,14 +307,14 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
    const auto values_view = this->values.getConstView();
    auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -324,9 +324,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -336,9 +336,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h
index 511942f01..01d3a0b91 100644
--- a/src/TNL/Matrices/LambdaMatrix.h
+++ b/src/TNL/Matrices/LambdaMatrix.h
@@ -388,7 +388,9 @@ class LambdaMatrix
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/LambdaMatrix/LambdaMatrixExample_reduceRows.cpp
@@ -396,7 +398,7 @@ class LambdaMatrix
        * \include LambdaMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
@@ -413,7 +415,9 @@ class LambdaMatrix
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/LambdaMatrix/LambdaMatrixExample_reduceAllRows.cpp
@@ -421,7 +425,7 @@ class LambdaMatrix
        * \include LambdaMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Computes product of matrix and vector.
diff --git a/src/TNL/Matrices/LambdaMatrix.hpp b/src/TNL/Matrices/LambdaMatrix.hpp
index f2cdb7574..20d2ccbb1 100644
--- a/src/TNL/Matrices/LambdaMatrix.hpp
+++ b/src/TNL/Matrices/LambdaMatrix.hpp
@@ -262,7 +262,7 @@ template< typename MatrixElementsLambda,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
    using FetchType = decltype( fetch( IndexType(), IndexType(), RealType() ) );
 
@@ -272,13 +272,13 @@ reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce,
    auto matrixElements = this->matrixElementsLambda;
    auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       const IndexType rowLength = rowLengths( rows, columns, rowIdx );
-      FetchType result( zero );
+      FetchType result = identity;
       for( IndexType localIdx = 0; localIdx < rowLength; localIdx++ )
       {
         IndexType elementColumn( 0 );
         RealType elementValue( 0.0 );
         matrixElements( rows, columns, rowIdx, localIdx, elementColumn, elementValue );
-        FetchType fetchValue( zero );
+        FetchType fetchValue = identity;
         if( elementValue != 0.0 )
             fetchValue = fetch( rowIdx, elementColumn, elementValue );
         result = reduce( result, fetchValue );
@@ -296,9 +296,9 @@ template< typename MatrixElementsLambda,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename MatrixElementsLambda,
@@ -444,7 +444,7 @@ print( std::ostream& str ) const
 
 /**
  * \brief Insertion operator for dense matrix and output stream.
- * 
+ *
  * \param str is the output stream.
  * \param matrix is the lambda matrix.
  * \return reference to the stream.
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h
index e29796a1e..d938a1062 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.h
+++ b/src/TNL/Matrices/MultidiagonalMatrix.h
@@ -47,7 +47,7 @@ namespace Matrices {
  * are \f$\{-3,-1,0,1,3\}\f$. Advantage is that we do not store the column indexes
  * explicitly as it is in \ref SparseMatrix. This can reduce significantly the
  * memory requirements which also means better performance. See the following table
- * for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix 
+ * for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix
  * and \ref TNL::Matrices::SparseMatrix.
  *
  *  Real   | Index     |      SparseMatrix    | MultidiagonalMatrix | Ratio
@@ -614,7 +614,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceRows.cpp
@@ -622,7 +624,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -641,7 +643,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceRows.cpp
@@ -649,7 +653,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -666,7 +670,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceAllRows.cpp
@@ -674,7 +680,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -691,7 +697,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceAllRows.cpp
@@ -699,7 +707,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over matrix rows for constant instances.
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.hpp b/src/TNL/Matrices/MultidiagonalMatrix.hpp
index 7e6ac450f..99534b92b 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrix.hpp
@@ -477,9 +477,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -491,9 +491,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -505,9 +505,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -519,9 +519,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.h b/src/TNL/Matrices/MultidiagonalMatrixView.h
index bc3de664b..357560213 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.h
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.h
@@ -376,7 +376,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceRows.cpp
@@ -384,7 +386,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
@@ -403,7 +405,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceRows.cpp
@@ -411,7 +415,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -428,7 +432,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceAllRows.cpp
@@ -436,7 +442,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -453,7 +459,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceAllRows.cpp
@@ -461,7 +469,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.hpp b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
index 2b83fc87b..03bc6907e 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
@@ -356,7 +356,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    const auto values_view = this->values.getConstView();
@@ -364,9 +364,8 @@ reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep&
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
@@ -385,7 +384,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    const auto values_view = this->values.getConstView();
@@ -393,9 +392,8 @@ reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep&
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
@@ -414,9 +412,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -426,9 +424,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index b9b7dceae..237417d66 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -617,7 +617,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
@@ -625,7 +627,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -644,7 +646,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
@@ -652,7 +656,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -669,7 +673,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
@@ -677,7 +683,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -694,7 +700,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
@@ -702,7 +710,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for parallel iteration over matrix elements of given rows for constant instances.
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 21bf6d143..a183b38c1 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -537,9 +537,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -553,9 +553,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -569,9 +569,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -585,9 +585,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 8651ad1c3..40a89b628 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -408,7 +408,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
@@ -416,7 +418,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -435,7 +437,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
@@ -443,7 +447,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -460,7 +464,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
@@ -468,7 +474,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -485,7 +491,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
@@ -493,7 +501,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index ad2da0d4b..cf5e9771a 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -504,7 +504,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
    auto columns_view = this->columnIndexes.getView();
    auto values_view = this->values.getView();
@@ -518,9 +518,9 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          else
             return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       }
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -532,7 +532,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
@@ -547,9 +547,9 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          else
             return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       }
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -561,9 +561,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -575,9 +575,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/TridiagonalMatrix.h b/src/TNL/Matrices/TridiagonalMatrix.h
index c970ff9b7..b74e0dcb9 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.h
+++ b/src/TNL/Matrices/TridiagonalMatrix.h
@@ -506,7 +506,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceRows.cpp
@@ -514,7 +516,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows of constant matrix instances.
@@ -533,7 +535,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceRows.cpp
@@ -541,7 +545,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -560,7 +564,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceAllRows.cpp
@@ -568,7 +574,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows of constant matrix instances.
@@ -587,7 +593,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceAllRows.cpp
@@ -595,7 +603,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over matrix rows for constant instances.
diff --git a/src/TNL/Matrices/TridiagonalMatrix.hpp b/src/TNL/Matrices/TridiagonalMatrix.hpp
index 87a508a9c..1841df5c8 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrix.hpp
@@ -348,9 +348,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -361,9 +361,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -374,9 +374,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -387,9 +387,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.h b/src/TNL/Matrices/TridiagonalMatrixView.h
index be2926934..e05a8b059 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.h
+++ b/src/TNL/Matrices/TridiagonalMatrixView.h
@@ -363,7 +363,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceRows.cpp
@@ -371,7 +373,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
@@ -390,7 +392,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceRows.cpp
@@ -398,7 +402,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -415,7 +419,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceAllRows.cpp
@@ -423,7 +429,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -440,7 +446,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. the element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceAllRows.cpp
@@ -448,7 +456,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp
index 5e7bfe756..cf510bf8c 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp
@@ -279,14 +279,13 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    const auto values_view = this->values.getConstView();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       if( rowIdx == 0 )
       {
          sum = reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) );
@@ -323,14 +322,13 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    auto values_view = this->values.getConstView();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       if( rowIdx == 0 )
       {
          sum = reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) );
@@ -367,9 +365,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -379,9 +377,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/UnitTests/Algorithms/SegmentedScanTest.h b/src/UnitTests/Algorithms/SegmentedScanTest.h
index 7f141fd72..4b467ca88 100644
--- a/src/UnitTests/Algorithms/SegmentedScanTest.h
+++ b/src/UnitTests/Algorithms/SegmentedScanTest.h
@@ -131,19 +131,19 @@ TYPED_TEST( SegmentedScanTest, inclusive )
    flags_copy = flags_view;
 
    v = 0;
-   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v.getElement( i ), 0 );
    flags_view = flags_copy;
 
    v = 1;
-   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v.getElement( i ), ( i % 5 ) + 1 );
    flags_view = flags_copy;
 
    setLinearSequence( v );
-   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
    for( int i = 1; i < size; i++ )
    {
       if( flags.getElement( i ) )
@@ -154,19 +154,19 @@ TYPED_TEST( SegmentedScanTest, inclusive )
    flags_view = flags_copy;
 
    v_view = 0;
-   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_view.getElement( i ), 0 );
    flags_view = flags_copy;
 
    v_view = 1;
-   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
    for( int i = 0; i < size; i++ )
       EXPECT_EQ( v_view.getElement( i ), ( i % 5 ) + 1 );
    flags_view = flags_copy;
 
    setLinearSequence( v );
-   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdempotent< ValueType >() );
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
    for( int i = 1; i < size; i++ )
    {
       if( flags.getElement( i ) )
-- 
GitLab


From 05903a8ff460a9457c5b9c6116d071f8a2307806 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 24 Jul 2021 12:14:35 +0200
Subject: [PATCH 38/52] Removed useless vertical operations and used RemoveET
 in reduce.h

---
 src/TNL/Algorithms/reduce.h                   |  13 +-
 .../DistributedVerticalOperations.h           |  42 ++----
 .../Expressions/ExpressionTemplates.h         |  24 +--
 .../Expressions/VerticalOperations.h          | 142 ------------------
 src/TNL/Functional.h                          |   4 +-
 5 files changed, 34 insertions(+), 191 deletions(-)
 delete mode 100644 src/TNL/Containers/Expressions/VerticalOperations.h

diff --git a/src/TNL/Algorithms/reduce.h b/src/TNL/Algorithms/reduce.h
index a9eeed612..a769a2e1a 100644
--- a/src/TNL/Algorithms/reduce.h
+++ b/src/TNL/Algorithms/reduce.h
@@ -16,6 +16,7 @@
 
 #include <TNL/Functional.h>  // extension of STL functionals for reduction
 #include <TNL/Algorithms/detail/Reduction.h>
+#include <TNL/Containers/Expressions/TypeTraits.h>  // RemoveET
 
 namespace TNL {
 namespace Algorithms {
@@ -131,7 +132,7 @@ auto reduce( const Index begin,
              Fetch&& fetch,
              Reduction&& reduction = TNL::Plus{} )
 {
-   using Result = std::decay_t< decltype( fetch( 0 ) ) >;
+   using Result = Containers::Expressions::RemoveET< decltype( reduction( fetch(0), fetch(0) ) ) >;
    return reduce< Device >( begin,
                             end,
                             std::forward< Fetch >( fetch ),
@@ -204,10 +205,10 @@ template< typename Array,
 auto reduce( const Array& array,
              Reduction&& reduction = TNL::Plus{} )
 {
-   using ValueType = typename Array::ValueType;
+   using Result = Containers::Expressions::RemoveET< decltype( reduction( array(0), array(0) ) ) >;
    return reduce< Array, Device >( array,
                                    std::forward< Reduction >( reduction ),
-                                   reduction.template getIdentity< ValueType >() );
+                                   reduction.template getIdentity< Result >() );
 }
 
 /**
@@ -329,7 +330,7 @@ reduceWithArgument( const Index begin,
                     Fetch&& fetch,
                     Reduction&& reduction )
 {
-   using Result = std::decay_t< decltype( fetch( 0 ) ) >;
+   using Result = Containers::Expressions::RemoveET< decltype( fetch(0) ) >;
    return reduceWithArgument< Device >( begin,
                                         end,
                                         std::forward< Fetch >( fetch ),
@@ -400,10 +401,10 @@ template< typename Array,
 auto reduceWithArgument( const Array& array,
                          Reduction&& reduction )
 {
-   using ValueType = typename Array::ValueType;
+   using Result = Containers::Expressions::RemoveET< decltype( array(0) ) >;
    return reduceWithArgument< Array, Device >( array,
                                                std::forward< Reduction >( reduction ),
-                                               reduction.template getIdentity< ValueType >() );
+                                               reduction.template getIdentity< Result >() );
 }
 
 } // namespace Algorithms
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index e1f850013..37bf2c868 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -10,8 +10,8 @@
 
 #pragma once
 
-#include <TNL/Containers/Expressions/VerticalOperations.h>
 #include <TNL/MPI/Wrappers.h>
+#include <TNL/Algorithms/reduce.h>
 
 namespace TNL {
 namespace Containers {
@@ -26,7 +26,7 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionMin( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Min{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() );
    }
    return result;
@@ -46,7 +46,7 @@ auto DistributedExpressionArgMin( const Expression& expression )
    const auto group = expression.getCommunicationGroup();
    if( group != MPI::NullGroup() ) {
       // compute local argMin
-      ResultType localResult = ExpressionArgMin( expression.getConstLocalView() );
+      ResultType localResult = Algorithms::reduceWithArgument( expression.getConstLocalView(), TNL::MinWithArg{} );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
@@ -62,15 +62,7 @@ auto DistributedExpressionArgMin( const Expression& expression )
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
       auto fetch = [_data] ( IndexType i ) { return _data[ i ].first; };
-      auto reduction = [] ( RealType& a, const RealType& b, IndexType& aIdx, const IndexType& bIdx ) {
-         if( a > b ) {
-            a = b;
-            aIdx = bIdx;
-         }
-         else if( a == b && bIdx < aIdx )
-            aIdx = bIdx;
-      };
-      result = Algorithms::reduceWithArgument< Devices::Host >( (IndexType) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::max() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( (IndexType) 0, (IndexType) nproc, fetch, TNL::MinWithArg{} );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
@@ -85,7 +77,7 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::lowest();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionMax( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Max{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() );
    }
    return result;
@@ -105,7 +97,7 @@ auto DistributedExpressionArgMax( const Expression& expression )
    const auto group = expression.getCommunicationGroup();
    if( group != MPI::NullGroup() ) {
       // compute local argMax
-      ResultType localResult = ExpressionArgMax( expression.getConstLocalView() );
+      ResultType localResult = Algorithms::reduceWithArgument( expression.getConstLocalView(), TNL::MaxWithArg{} );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
@@ -121,15 +113,7 @@ auto DistributedExpressionArgMax( const Expression& expression )
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
       auto fetch = [_data] ( IndexType i ) { return _data[ i ].first; };
-      auto reduction = [] ( RealType& a, const RealType& b, IndexType& aIdx, const IndexType& bIdx ) {
-         if( a < b ) {
-            a = b;
-            aIdx = bIdx;
-         }
-         else if( a == b && bIdx < aIdx )
-            aIdx = bIdx;
-      };
-      result = Algorithms::reduceWithArgument< Devices::Host >( ( IndexType ) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::lowest() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( ( IndexType ) 0, (IndexType) nproc, fetch, TNL::MaxWithArg{} );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
@@ -142,7 +126,7 @@ auto DistributedExpressionSum( const Expression& expression ) -> std::decay_t< d
 
    ResultType result = 0;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionSum( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Plus{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() );
    }
    return result;
@@ -155,7 +139,7 @@ auto DistributedExpressionProduct( const Expression& expression ) -> std::decay_
 
    ResultType result = 1;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionProduct( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Multiplies{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() );
    }
    return result;
@@ -170,7 +154,7 @@ auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::dec
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::LogicalAnd{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() );
    }
    return result;
@@ -183,7 +167,7 @@ auto DistributedExpressionLogicalOr( const Expression& expression ) -> std::deca
 
    ResultType result = 0;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionLogicalOr( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::LogicalOr{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() );
    }
    return result;
@@ -198,7 +182,7 @@ auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::deca
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::BitAnd{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() );
    }
    return result;
@@ -211,7 +195,7 @@ auto DistributedExpressionBinaryOr( const Expression& expression ) -> std::decay
 
    ResultType result = 0;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionBinaryOr( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::BitOr{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() );
    }
    return result;
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 6e9ae794e..3e01255fc 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -18,7 +18,7 @@
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
 #include <TNL/Containers/Expressions/Comparison.h>
 #include <TNL/Containers/Expressions/HorizontalOperations.h>
-#include <TNL/Containers/Expressions/VerticalOperations.h>
+#include <TNL/Algorithms/reduce.h>
 
 namespace TNL {
 namespace Containers {
@@ -370,7 +370,7 @@ template< typename ET1, typename ET2,
 auto
 operator,( const ET1& a, const ET2& b )
 {
-   return ExpressionSum( a * b );
+   return Algorithms::reduce( a * b, TNL::Plus{} );
 }
 
 template< typename ET1, typename ET2,
@@ -662,7 +662,7 @@ template< typename ET1,
 auto
 min( const ET1& a )
 {
-   return ExpressionMin( a );
+   return Algorithms::reduce( a, TNL::Min{} );
 }
 
 template< typename ET1,
@@ -670,7 +670,7 @@ template< typename ET1,
 auto
 argMin( const ET1& a )
 {
-   return ExpressionArgMin( a );
+   return Algorithms::reduceWithArgument( a, TNL::MinWithArg{} );
 }
 
 template< typename ET1,
@@ -678,7 +678,7 @@ template< typename ET1,
 auto
 max( const ET1& a )
 {
-   return ExpressionMax( a );
+   return Algorithms::reduce( a, TNL::Max{} );
 }
 
 template< typename ET1,
@@ -686,7 +686,7 @@ template< typename ET1,
 auto
 argMax( const ET1& a )
 {
-   return ExpressionArgMax( a );
+   return Algorithms::reduceWithArgument( a, TNL::MaxWithArg{} );
 }
 
 template< typename ET1,
@@ -694,7 +694,7 @@ template< typename ET1,
 auto
 sum( const ET1& a )
 {
-   return ExpressionSum( a );
+   return Algorithms::reduce( a, TNL::Plus{} );
 }
 
 template< typename ET1,
@@ -743,7 +743,7 @@ template< typename ET1,
 auto
 product( const ET1& a )
 {
-   return ExpressionProduct( a );
+   return Algorithms::reduce( a, TNL::Multiplies{} );
 }
 
 template< typename ET1,
@@ -751,7 +751,7 @@ template< typename ET1,
 auto
 logicalAnd( const ET1& a )
 {
-   return ExpressionLogicalAnd( a );
+   return Algorithms::reduce( a, TNL::LogicalAnd{} );
 }
 
 template< typename ET1,
@@ -759,7 +759,7 @@ template< typename ET1,
 auto
 logicalOr( const ET1& a )
 {
-   return ExpressionLogicalOr( a );
+   return Algorithms::reduce( a, TNL::LogicalOr{} );
 }
 
 template< typename ET1,
@@ -767,7 +767,7 @@ template< typename ET1,
 auto
 binaryAnd( const ET1& a )
 {
-   return ExpressionBinaryAnd( a );
+   return Algorithms::reduce( a, TNL::BitAnd{} );
 }
 
 template< typename ET1,
@@ -775,7 +775,7 @@ template< typename ET1,
 auto
 binaryOr( const ET1& a )
 {
-   return ExpressionBinaryOr( a );
+   return Algorithms::reduce( a, TNL::BitOr{} );
 }
 
 #endif // DOXYGEN_ONLY
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
deleted file mode 100644
index 324448541..000000000
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/***************************************************************************
-                          VerticalOperations.h  -  description
-                             -------------------
-    begin                : May 1, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <limits>
-#include <type_traits>
-
-#include <TNL/Functional.h>
-#include <TNL/Algorithms/reduce.h>
-#include <TNL/Containers/Expressions/TypeTraits.h>
-
-////
-// By vertical operations we mean those applied across vector elements or
-// vector expression elements. It means for example minim/maximum of all
-// vector elements etc.
-namespace TNL {
-namespace Containers {
-namespace Expressions {
-
-////
-// Vertical operations
-template< typename Expression >
-auto ExpressionMin( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Min{}, TNL::Min::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionArgMin( const Expression& expression )
--> RemoveET< std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MinWithArg{}, TNL::MinWithArg::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionMax( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Max{}, TNL::Max::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionArgMax( const Expression& expression )
--> RemoveET< std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::MaxWithArg{}, TNL::MaxWithArg::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionSum( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] + expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] + expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Plus{}, TNL::Plus::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionProduct( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] * expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] * expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::Multiplies{}, TNL::Multiplies::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionLogicalAnd( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] && expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] && expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalAnd{}, TNL::LogicalAnd::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionLogicalOr( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] || expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] || expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::LogicalOr{}, TNL::LogicalOr::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionBinaryAnd( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] & expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] & expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitAnd{}, TNL::BitAnd::template getIdentity< ResultType >() );
-}
-
-template< typename Expression >
-auto ExpressionBinaryOr( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] | expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] | expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), view, TNL::BitOr{}, TNL::BitOr::template getIdentity< ResultType >() );
-}
-
-} // namespace Expressions
-} // namespace Containers
-} // namespace TNL
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index b1f897433..793609c53 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -80,7 +80,7 @@ struct Max
 };
 
 /**
- * \brief Extension of \ref std::min<void> for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Function object implementing `argmin(x, y, i, j)` for use with \ref TNL::Algorithms::reduceWithArgument.
  */
 struct MinWithArg
 {
@@ -108,7 +108,7 @@ struct MinWithArg
 };
 
 /**
- * \brief Extension of \ref std::max<void> for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Function object implementing `argmax(x, y, i, j)` for use with \ref TNL::Algorithms::reduceWithArgument.
  */
 struct MaxWithArg
 {
-- 
GitLab


From 4bbf495c41d9735010214fe21955118ed82591a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 29 Jul 2021 10:04:10 +0200
Subject: [PATCH 39/52] Added specialization of CudaBlockReduce using __shfl
 instructions

---
 .../Algorithms/detail/CudaReductionKernel.h   | 283 +++++++++++++-----
 src/TNL/Algorithms/detail/CudaScanKernel.h    |   2 +-
 2 files changed, 212 insertions(+), 73 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
index 60100c628..92f54fc1b 100644
--- a/src/TNL/Algorithms/detail/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -74,6 +74,8 @@ struct CudaBlockReduce
     * result of the reduction
     *
     * \param reduction   The binary reduction functor.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
     * \param threadValue Value of the calling thread to be reduced.
     * \param tid         Index of the calling thread (usually `threadIdx.x`,
     *                    unless you know what you are doing).
@@ -83,6 +85,7 @@ struct CudaBlockReduce
    __device__ static
    ValueType
    reduce( const Reduction& reduction,
+           ValueType identity,
            ValueType threadValue,
            int tid,
            Storage& storage )
@@ -141,6 +144,137 @@ struct CudaBlockReduce
    }
 };
 
+template< int blockSize,
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockReduceShfl
+{
+   // storage to be allocated in shared memory: one slot for the partial result of each warp
+   struct Storage
+   {
+      ValueType warpResults[ Cuda::getWarpSize() ];
+   };
+
+   /* Cooperative reduction across the CUDA block - each thread will get the
+    * result of the reduction. It contains __syncthreads() barriers, so all threads of the block must call it.
+    *
+    * \param reduction   The binary reduction functor.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue Value of the calling thread to be reduced.
+    * \param tid         Index of the calling thread (usually `threadIdx.x`,
+    *                    unless you know what you are doing).
+    * \param storage     Auxiliary storage (must be allocated as a __shared__
+    *                    variable).
+    */
+   __device__ static
+   ValueType
+   reduce( const Reduction& reduction,
+           ValueType identity,
+           ValueType threadValue,
+           int tid,
+           Storage& storage )
+   {
+      // verify the configuration: the number of warps in the block must fit into a single warp
+      static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
+                     "blockSize is too large, it would not be possible to reduce warpResults using one warp" );
+
+      int lane_id = threadIdx.x % warpSize;  // NOTE(review): derived from threadIdx.x rather than from tid,
+      int warp_id = threadIdx.x / warpSize;  // so tid presumably has to equal threadIdx.x here - confirm
+
+      // perform the parallel reduction within each warp (every lane gets the result of its warp)
+      threadValue = warpReduce( reduction, threadValue );
+
+      // the first thread of each warp writes the result into the shared memory
+      if( lane_id == 0 )
+         storage.warpResults[ warp_id ] = threadValue;
+      __syncthreads();
+
+      // the first warp performs the final reduction of the per-warp results
+      if( warp_id == 0 ) {
+         // read from shared memory only if that warp existed, the remaining lanes use the identity
+         if( tid < blockSize / Cuda::getWarpSize() )
+            threadValue = storage.warpResults[ lane_id ];
+         else
+            threadValue = identity;
+         threadValue = warpReduce( reduction, threadValue );
+      }
+
+      // the first thread writes the final result into the shared memory, from where all threads read it
+      if( tid == 0 )
+         storage.warpResults[ 0 ] = threadValue;
+
+      __syncthreads();
+      return storage.warpResults[ 0 ];
+   }
+
+   /* Helper function.
+    * Cooperative reduction across the warp - each thread will get the result
+    * of the reduction. Values are exchanged by __shfl_xor_sync with lane offsets warpSize/2, ..., 2, 1.
+    */
+   __device__ static
+   ValueType
+   warpReduce( const Reduction& reduction,
+               ValueType threadValue )
+   {
+      constexpr unsigned mask = 0xffffffff;  // all lanes of the warp take part in the shuffle
+      #pragma unroll
+      for( int i = Cuda::getWarpSize() / 2; i > 0; i /= 2 ) {
+         const ValueType otherValue = __shfl_xor_sync( mask, threadValue, i );
+         threadValue = reduction( threadValue, otherValue );
+      }
+      return threadValue;
+   }
+};
+// Specializations for builtin arithmetic types (which __shfl_xor_sync can exchange directly): use the shuffle-based reduction.
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, int >
+: public CudaBlockReduceShfl< blockSize, Reduction, int >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, unsigned int >
+: public CudaBlockReduceShfl< blockSize, Reduction, unsigned int >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, long >
+: public CudaBlockReduceShfl< blockSize, Reduction, long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, unsigned long >
+: public CudaBlockReduceShfl< blockSize, Reduction, unsigned long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, long long >
+: public CudaBlockReduceShfl< blockSize, Reduction, long long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, unsigned long long >
+: public CudaBlockReduceShfl< blockSize, Reduction, unsigned long long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, float >
+: public CudaBlockReduceShfl< blockSize, Reduction, float >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, double >
+: public CudaBlockReduceShfl< blockSize, Reduction, double >
+{};
+
 /* Template for cooperative reduction with argument across the CUDA block of
  * threads. It is a *cooperative* operation - all threads must call the
  * operation, otherwise it will deadlock!
@@ -169,6 +303,8 @@ struct CudaBlockReduceWithArgument
     * will get the pair of the result of the reduction and the index
     *
     * \param reduction   The binary reduction functor.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
     * \param threadValue Value of the calling thread to be reduced.
     * \param threadIndex Index value of the calling thread to be reduced.
     * \param tid         Index of the calling thread (usually `threadIdx.x`,
@@ -179,6 +315,7 @@ struct CudaBlockReduceWithArgument
    __device__ static
    std::pair< ValueType, IndexType >
    reduceWithArgument( const Reduction& reduction,
+                       ValueType identity,
                        ValueType threadValue,
                        IndexType threadIndex,
                        int tid,
@@ -258,15 +395,15 @@ static constexpr int Reduction_registersPerThread = 32;   // empirically determi
 #endif
 
 template< int blockSize,
-          typename Result,
           typename DataFetcher,
           typename Reduction,
+          typename Result,
           typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionKernel( Result initialValue,
-                     DataFetcher dataFetcher,
+CudaReductionKernel( DataFetcher dataFetcher,
                      const Reduction reduction,
+                     Result identity,
                      Index begin,
                      Index end,
                      Result* output )
@@ -283,42 +420,43 @@ CudaReductionKernel( Result initialValue,
    begin += blockIdx.x * blockDim.x + threadIdx.x;
 
    // Start with the sequential reduction and push the result into the shared memory.
+   Result result = identity;
    while( begin + 4 * gridSize < end ) {
-      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
-      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + gridSize ) );
-      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + 2 * gridSize ) );
-      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + 3 * gridSize ) );
+      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin ) );
+      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + gridSize ) );
+      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + 2 * gridSize ) );
+      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + 3 * gridSize ) );
       begin += 4 * gridSize;
    }
    while( begin + 2 * gridSize < end ) {
-      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
-      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin + gridSize ) );
+      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin ) );
+      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + gridSize ) );
       begin += 2 * gridSize;
    }
    while( begin < end ) {
-      initialValue = CudaReductionFunctorWrapper( reduction, initialValue, dataFetcher( begin ) );
+      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin ) );
       begin += gridSize;
    }
    __syncthreads();
 
    // Perform the parallel reduction.
-   initialValue = BlockReduce::reduce( reduction, initialValue, threadIdx.x, storage );
+   result = BlockReduce::reduce( reduction, identity, result, threadIdx.x, storage );
 
    // Store the result back in the global memory.
    if( threadIdx.x == 0 )
-      output[ blockIdx.x ] = initialValue;
+      output[ blockIdx.x ] = result;
 }
 
 template< int blockSize,
-          typename Result,
           typename DataFetcher,
           typename Reduction,
+          typename Result,
           typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionWithArgumentKernel( Result initialValue,
-                                 DataFetcher dataFetcher,
+CudaReductionWithArgumentKernel( DataFetcher dataFetcher,
                                  const Reduction reduction,
+                                 Result identity,
                                  Index begin,
                                  Index end,
                                  Result* output,
@@ -340,61 +478,62 @@ CudaReductionWithArgumentKernel( Result initialValue,
    Index initialIndex;
 
    // Start with the sequential reduction and push the result into the shared memory.
+   Result result = identity;
    if( idxInput ) {
       if( begin < end ) {
-         initialValue = dataFetcher( begin );
+         result = dataFetcher( begin );
          initialIndex = idxInput[ begin ];
          begin += gridSize;
       }
       while( begin + 4 * gridSize < end ) {
-         reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
-         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
-         reduction( initialValue, dataFetcher( begin + 2 * gridSize ), initialIndex, idxInput[ begin + 2 * gridSize ] );
-         reduction( initialValue, dataFetcher( begin + 3 * gridSize ), initialIndex, idxInput[ begin + 3 * gridSize ] );
+         reduction( result, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
+         reduction( result, dataFetcher( begin + 2 * gridSize ), initialIndex, idxInput[ begin + 2 * gridSize ] );
+         reduction( result, dataFetcher( begin + 3 * gridSize ), initialIndex, idxInput[ begin + 3 * gridSize ] );
          begin += 4 * gridSize;
       }
       while( begin + 2 * gridSize < end ) {
-         reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
-         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
+         reduction( result, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
          begin += 2 * gridSize;
       }
       while( begin < end ) {
-         reduction( initialValue, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         reduction( result, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
          begin += gridSize;
       }
    }
    else {
       if( begin < end ) {
-         initialValue = dataFetcher( begin );
+         result = dataFetcher( begin );
          initialIndex = begin;
          begin += gridSize;
       }
       while( begin + 4 * gridSize < end ) {
-         reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
-         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
-         reduction( initialValue, dataFetcher( begin + 2 * gridSize ), initialIndex, begin + 2 * gridSize );
-         reduction( initialValue, dataFetcher( begin + 3 * gridSize ), initialIndex, begin + 3 * gridSize );
+         reduction( result, dataFetcher( begin ), initialIndex, begin );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
+         reduction( result, dataFetcher( begin + 2 * gridSize ), initialIndex, begin + 2 * gridSize );
+         reduction( result, dataFetcher( begin + 3 * gridSize ), initialIndex, begin + 3 * gridSize );
          begin += 4 * gridSize;
       }
       while( begin + 2 * gridSize < end ) {
-         reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
-         reduction( initialValue, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
+         reduction( result, dataFetcher( begin ), initialIndex, begin );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
          begin += 2 * gridSize;
       }
       while( begin < end ) {
-         reduction( initialValue, dataFetcher( begin ), initialIndex, begin );
+         reduction( result, dataFetcher( begin ), initialIndex, begin );
          begin += gridSize;
       }
    }
    __syncthreads();
 
    // Perform the parallel reduction.
-   const std::pair< Result, Index > result = BlockReduce::reduceWithArgument( reduction, initialValue, initialIndex, threadIdx.x, storage );
+   const std::pair< Result, Index > result_pair = BlockReduce::reduceWithArgument( reduction, identity, result, initialIndex, threadIdx.x, storage );
 
    // Store the result back in the global memory.
    if( threadIdx.x == 0 ) {
-      output[ blockIdx.x ] = result.first;
-      idxOutput[ blockIdx.x ] = result.second;
+      output[ blockIdx.x ] = result_pair.first;
+      idxOutput[ blockIdx.x ] = result_pair.second;
    }
 }
 #endif
@@ -550,55 +689,55 @@ struct CudaReductionKernelLauncher
          {
             case 512:
                CudaReductionKernel< 512 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case 256:
-               cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel< 256, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel< 256 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case 128:
-               cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel< 128, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel< 128 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case  64:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<  64, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  64 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case  32:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<  32, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  32 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case  16:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<  16, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  16 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
            case   8:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<   8, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   8 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case   4:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<   4, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   4 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case   2:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<   2, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   2 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case   1:
                TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -611,11 +750,11 @@ struct CudaReductionKernelLauncher
 
          // Check just to future-proof the code setting blockSize.x
          if( blockSize.x == Reduction_maxThreadsPerBlock ) {
-            cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+            cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
             // shared memory is allocated statically inside the kernel
             CudaReductionKernel< Reduction_maxThreadsPerBlock >
-            <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, begin, end, output);
+            <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, begin, end, output);
             cudaStreamSynchronize(0);
             TNL_CHECK_CUDA_DEVICE;
          }
@@ -655,55 +794,55 @@ struct CudaReductionKernelLauncher
          {
             case 512:
                CudaReductionWithArgumentKernel< 512 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case 256:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel< 256 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case 128:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel< 128 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case  64:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  64 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case  32:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  32 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case  16:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  16 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
            case   8:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   8 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case   4:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   4 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case   2:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   2 >
-               <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case   1:
                TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -716,11 +855,11 @@ struct CudaReductionKernelLauncher
 
          // Check just to future-proof the code setting blockSize.x
          if( blockSize.x == Reduction_maxThreadsPerBlock ) {
-            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
             // shared memory is allocated statically inside the kernel
             CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
-            <<< gridSize, blockSize >>>( identity, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
+            <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, begin, end, output, idxOutput, idxInput );
             cudaStreamSynchronize(0);
             TNL_CHECK_CUDA_DEVICE;
          }
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index eee63717b..b5a90ced3 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -312,7 +312,7 @@ CudaScanKernelUpsweep( const InputView input,
    __syncthreads();
 
    // Perform the parallel reduction.
-   value = BlockReduce::reduce( reduction, value, threadIdx.x, storage.blockReduceStorage );
+   value = BlockReduce::reduce( reduction, identity, value, threadIdx.x, storage.blockReduceStorage );
 
    // Store the block result in the global memory.
    if( threadIdx.x == 0 )
-- 
GitLab


From 1a64a618ed851cf2eb02a2b60423b6dd044c47f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 29 Jul 2021 15:10:25 +0200
Subject: [PATCH 40/52] Added specialization of CudaBlockScan using __shfl
 instructions

---
 src/TNL/Algorithms/Sorting/detail/reduction.h | 138 -------------
 src/TNL/Algorithms/detail/CudaScanKernel.h    | 188 ++++++++++++++++--
 2 files changed, 173 insertions(+), 153 deletions(-)
 delete mode 100644 src/TNL/Algorithms/Sorting/detail/reduction.h

diff --git a/src/TNL/Algorithms/Sorting/detail/reduction.h b/src/TNL/Algorithms/Sorting/detail/reduction.h
deleted file mode 100644
index e2bf14809..000000000
--- a/src/TNL/Algorithms/Sorting/detail/reduction.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/***************************************************************************
-                          reduction.h  -  description
-                             -------------------
-    begin                : Jul 13, 2021
-    copyright            : (C) 2021 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Xuan Thang Nguyen
-
-#pragma once
-
-namespace TNL {
-    namespace Algorithms {
-        namespace Sorting {
-
-#ifdef HAVE_CUDA
-
-/**
- * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
- * */
-
-
-__device__ int warpReduceSum(int initVal)
-{
-    const unsigned int maskConstant = 0xffffffff; //not used
-    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
-        initVal += __shfl_xor_sync(maskConstant, initVal, mask);
-
-    return initVal;
-}
-
-__device__ int blockReduceSum(int val)
-{
-    static __shared__ int shared[32];
-    int lane = threadIdx.x & (warpSize - 1);
-    int wid = threadIdx.x / warpSize;
-
-    val = warpReduceSum(val);
-
-    if (lane == 0)
-        shared[wid] = val;
-    __syncthreads();
-
-    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
-
-    if (wid == 0)
-        val = warpReduceSum(val);
-
-    if(threadIdx.x == 0)
-        shared[0] = val;
-    __syncthreads();
-
-    return shared[0];
-}
-
-//-------------------------------------------------------------------------------
-
-__device__ int warpInclusivePrefixSum(int value)
-{
-    int laneId = threadIdx.x & (32-1);
-
-    #pragma unroll
-    for (int i = 1; i*2 <= 32; i *= 2)//32 here is warp size
-    {
-        int n = __shfl_up_sync(0xffffffff, value, i);
-        if ((laneId & (warpSize - 1)) >= i)
-            value += n;
-    }
-
-    return value;
-}
-
-__device__ int blockInclusivePrefixSum(int value)
-{
-    static __shared__ int shared[32];
-    int lane = threadIdx.x & (warpSize - 1);
-    int wid = threadIdx.x / warpSize;
-
-    int tmp = warpInclusivePrefixSum(value);
-
-    if (lane == warpSize-1)
-        shared[wid] = tmp;
-    __syncthreads();
-
-    int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
-    if (wid == 0)
-        shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2;
-    __syncthreads();
-
-    tmp += shared[wid];
-    return tmp;
-}
-
-//--------------------------------------------------------------------
-
-template<typename Operator>
-__device__ int warpCmpReduce(int initVal, const Operator & Cmp)
-{
-    const unsigned int maskConstant = 0xffffffff; //not used
-    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
-        initVal = Cmp(initVal, __shfl_xor_sync(maskConstant, initVal, mask));
-
-    return initVal;
-}
-
-template<typename Operator>
-__device__ int blockCmpReduce(int val, const Operator & Cmp)
-{
-    static __shared__ int shared[32];
-    int lane = threadIdx.x & (warpSize - 1);
-    int wid = threadIdx.x / warpSize;
-
-    val = warpCmpReduce(val, Cmp);
-
-    if (lane == 0)
-        shared[wid] = val;
-    __syncthreads();
-
-    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0];
-
-    if (wid == 0)
-        val = warpCmpReduce(val, Cmp);
-
-    if(threadIdx.x == 0)
-        shared[0] = val;
-    __syncthreads();
-
-    return shared[0];
-}
-
-#endif
-
-        } // namespace Sorting
-    } // namespace Algorithms
-} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index b5a90ced3..0c170450c 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -68,43 +68,43 @@ struct CudaBlockScan
       static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
                      "blockSize is too large, it would not be possible to scan warpResults using one warp" );
 
-      // Store the threadValue in the shared memory.
+      // store the threadValue in the shared memory
       const int chunkResultIdx = Cuda::getInterleaving( tid );
       storage.chunkResults[ chunkResultIdx ] = threadValue;
       __syncthreads();
 
-      // Perform the parallel scan on chunkResults inside warps.
-      const int threadInWarpIdx = tid % Cuda::getWarpSize();
-      const int warpIdx = tid / Cuda::getWarpSize();
+      // perform the parallel scan on chunkResults inside warps
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
       #pragma unroll
       for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-         if( threadInWarpIdx >= stride ) {
+         if( lane_id >= stride ) {
             storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] );
          }
          __syncwarp();
       }
       threadValue = storage.chunkResults[ chunkResultIdx ];
 
-      // The last thread in warp stores the intermediate result in warpResults.
-      if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-         storage.warpResults[ warpIdx ] = threadValue;
+      // the last thread in warp stores the intermediate result in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = threadValue;
       __syncthreads();
 
-      // Perform the scan of warpResults using one warp.
-      if( warpIdx == 0 )
+      // perform the scan of warpResults using one warp
+      if( warp_id == 0 )
          #pragma unroll
          for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
-            if( threadInWarpIdx >= stride )
+            if( lane_id >= stride )
                storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] );
             __syncwarp();
          }
       __syncthreads();
 
-      // Shift threadValue by the warpResults.
-      if( warpIdx > 0 )
-         threadValue = reduction( threadValue, storage.warpResults[ warpIdx - 1 ] );
+      // shift threadValue by the warpResults
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );
 
-      // Shift the result for exclusive scan.
+      // shift the result for exclusive scan
       if( scanType == ScanType::Exclusive ) {
          storage.chunkResults[ chunkResultIdx ] = threadValue;
          __syncthreads();
@@ -116,6 +116,164 @@ struct CudaBlockScan
    }
 };
 
+template< ScanType scanType,
+          int unusedBlockSize,  // the __shfl implementation does not depend on the blockSize
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockScanShfl
+{
+   // storage to be allocated in shared memory: one slot per warp of the block
+   struct Storage
+   {
+      ValueType warpResults[ Cuda::getWarpSize() ];
+   };
+
+   /* Cooperative scan across the CUDA block - every thread of the block must
+    * call it (it contains __syncthreads) and gets the result for its own ID.
+    *
+    * \param reduction    The binary reduction functor.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue  Value of the calling thread to be scanned.
+    * \param tid          Index of the calling thread (usually `threadIdx.x`).
+    * \param storage      Auxiliary storage (must be allocated as a __shared__
+    *                     variable); it may be reused once the call returns.
+    * \return             The calling thread's result of the *scanType* scan.
+    */
+   __device__ static
+   ValueType
+   scan( const Reduction& reduction,
+         ValueType identity,
+         ValueType threadValue,
+         int tid,
+         Storage& storage )
+   {
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
+
+      // perform the parallel scan within each warp (total = inclusive result)
+      ValueType total;
+      threadValue = warpScan< scanType >( reduction, identity, threadValue, lane_id, total );
+
+      // the last thread in warp stores the result of inclusive scan in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = total;
+      __syncthreads();
+
+      // the first warp performs the inclusive scan of warpResults
+      if( warp_id == 0 ) {
+         // read from shared memory only if that (full) warp existed
+         if( tid < blockDim.x / Cuda::getWarpSize() )
+            total = storage.warpResults[ lane_id ];
+         else
+            total = identity;
+         storage.warpResults[ lane_id ] = warpScan< ScanType::Inclusive >( reduction, identity, total, lane_id, total );
+      }
+      __syncthreads();
+
+      // shift threadValue by the scanned total of all preceding warps
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );
+      // all reads of warpResults must finish before the storage can be reused
+      __syncthreads();
+      return threadValue;
+   }
+
+   /* Helper function: cooperative scan across one warp using __shfl_up_sync
+    * with a full mask, i.e. all lanes of the warp are expected to call it
+    * together (NOTE(review): confirm for blocks with a partial last warp).
+    * return value = thread's result of the *warpScanType* scan
+    * total = thread's result of the *inclusive* scan
+    */
+   template< ScanType warpScanType >
+   __device__ static
+   ValueType
+   warpScan( const Reduction& reduction,
+             ValueType identity,
+             ValueType threadValue,
+             int lane_id,
+             ValueType& total )
+   {
+      constexpr unsigned mask = 0xffffffff;
+
+      // inclusive scan: every lane shuffles, only lanes >= stride accumulate
+      #pragma unroll
+      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
+         const ValueType otherValue = __shfl_up_sync( mask, threadValue, stride );
+         if( lane_id >= stride )
+            threadValue = reduction( threadValue, otherValue );
+      }
+
+      // set the result of the inclusive scan
+      total = threadValue;
+
+      // exclusive scan: take the inclusive result of the previous lane
+      if( warpScanType == ScanType::Exclusive ) {
+         threadValue = __shfl_up_sync( mask, threadValue, 1 );
+         if( lane_id == 0 )
+            threadValue = identity;
+      }
+
+      return threadValue;
+   }
+};
+
+// ValueType = int: inherit the block-wide scan from the warp-shuffle based
+// implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, int >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, int >
+{};
+
+// ValueType = unsigned int: inherit the block-wide scan from the warp-shuffle
+// based implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned int >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned int >
+{};
+
+// ValueType = long: inherit the block-wide scan from the warp-shuffle based
+// implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, long >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, long >
+{};
+
+// ValueType = unsigned long: inherit the block-wide scan from the warp-shuffle
+// based implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long >
+{};
+
+// ValueType = long long: inherit the block-wide scan from the warp-shuffle
+// based implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, long long >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, long long >
+{};
+
+// ValueType = unsigned long long: inherit the block-wide scan from the
+// warp-shuffle based implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long long >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long long >
+{};
+
+// ValueType = float: inherit the block-wide scan from the warp-shuffle based
+// implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, float >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, float >
+{};
+
+// ValueType = double: inherit the block-wide scan from the warp-shuffle based
+// implementation in CudaBlockScanShfl
+template< ScanType scanType, int blockSize, typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, double >
+: CudaBlockScanShfl< scanType, blockSize, Reduction, double >
+{};
+
 /* Template for cooperative scan of a data tile in the global memory.
  * It is a *cooperative* operation - all threads must call the operation,
  * otherwise it will deadlock!
-- 
GitLab


From f0926de3f2a12729628e07efd2cd5bab3f110a43 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 28 Jul 2021 19:30:32 +0200
Subject: [PATCH 41/52] Refactored reduction and scan in quicksort
 implementation

---
 .../Algorithms/Sorting/detail/Quicksorter.h   |  3 +--
 .../Algorithms/Sorting/detail/Quicksorter.hpp | 19 ++++---------------
 .../Algorithms/Sorting/detail/cudaPartition.h |  9 ++++++---
 .../Sorting/detail/quicksort_1Block.h         |  9 +++++----
 .../Sorting/detail/quicksort_kernel.h         |  6 ++----
 5 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.h b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
index 0b97bd7c6..0a52b52fa 100644
--- a/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
+++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
@@ -13,7 +13,6 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Sorting/detail/task.h>
 
 namespace TNL {
@@ -94,7 +93,7 @@ class Quicksorter< Value, Devices::Cuda >
       Containers::Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount;  //is in reality 1 integer each
 
       Containers::Array<int, Devices::Cuda> cuda_blockToTaskMapping;
-      Containers::Vector<int, Devices::Cuda> cuda_reductionTaskInitMem;
+      Containers::Array<int, Devices::Cuda> cuda_reductionTaskInitMem;
 
       int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0;
       int iteration = 0;
diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
index da3f3a4fb..b775c9b77 100644
--- a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
+++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
@@ -17,7 +17,8 @@
 #include <TNL/Algorithms/Sorting/detail/quicksort_kernel.h>
 #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
 #include <TNL/Algorithms/Sorting/detail/Quicksorter.h>
-#include <TNL/Algorithms/Scan.h>
+#include <TNL/Algorithms/reduce.h>
+#include <TNL/Algorithms/scan.h>
 
 namespace TNL {
     namespace Algorithms {
@@ -314,8 +315,7 @@ int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::C
         int size = task.partitionEnd - task.partitionBegin;
         return size / elemPerBlock + (size % elemPerBlock != 0);
     };
-    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
-    return Algorithms::reduce<Devices::Cuda>( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 );
+    return reduce< Devices::Cuda >( 0, quicksort.host_1stPhaseTasksAmount, fetch, TNL::Plus{} );
 }
 
 template< typename Value >
@@ -323,14 +323,6 @@ int
 Quicksorter< Value, Devices::Cuda >::
 getSetsNeeded(int elemPerBlock) const
 {
-    /*auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
-    auto fetch = [=] __cuda_callable__(int i) {
-        const auto &task = view[i];
-        int size = task.partitionEnd - task.partitionBegin;
-        return size / elemPerBlock + (size % elemPerBlock != 0);
-    };
-    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
-    return Algorithms::reduce<Devices::Cuda>(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/
     return getSetsNeededFunction< Value >( elemPerBlock, *this );
 }
 
@@ -372,10 +364,7 @@ initTasks(int elemPerBlock, const CMP &Cmp)
                                                       cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));
     //cuda_reductionTaskInitMem[i] == how many blocks task i needs
 
-    //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; };
-
-    Algorithms::Scan<Devices::Cuda, Algorithms::ScanType::Inclusive >::
-        perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0);
+    inplaceInclusiveScan(cuda_reductionTaskInitMem);
     //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need
 
     int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);
diff --git a/src/TNL/Algorithms/Sorting/detail/cudaPartition.h b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
index a6afaa20a..5277cc4d3 100644
--- a/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
+++ b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
@@ -13,8 +13,8 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Algorithms/Sorting/detail/reduction.h>
 #include <TNL/Algorithms/Sorting/detail/task.h>
+#include <TNL/Algorithms/detail/CudaScanKernel.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -185,8 +185,11 @@ __device__ void cudaPartition( Containers::ArrayView<Value, Devices::Cuda> src,
     int smaller = 0, bigger = 0;
     countElem(srcView, Cmp, smaller, bigger, pivot);
 
-    int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
-    int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
+    //synchronization is in this function already
+    using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >;
+    __shared__ typename BlockScan::Storage storage;
+    int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage );
+    int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage );
 
     if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values
     {
diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
index efca29f24..0ed8efa4d 100644
--- a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
+++ b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
@@ -15,8 +15,7 @@
 #include <TNL/Containers/Array.h>
 #include "cassert"
 #include <TNL/Algorithms/Sorting/detail/bitonicSort.h>
-#include <TNL/Algorithms/Sorting/detail/reduction.h>
-#include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
+#include <TNL/Algorithms/detail/CudaScanKernel.h>
 
 namespace TNL {
     namespace Algorithms {
@@ -134,8 +133,10 @@ __device__ void singleBlockQuickSort( Containers::ArrayView<Value, TNL::Devices:
         countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot);
 
         //synchronization is in this function already
-        int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
-        int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
+        using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >;
+        __shared__ typename BlockScan::Storage storage;
+        int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage );
+        int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage );
 
         if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src
         {
diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
index 8d26d0637..555e6c538 100644
--- a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
+++ b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
@@ -13,8 +13,6 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Sorting/detail/reduction.h>
 #include <TNL/Algorithms/Sorting/detail/task.h>
 #include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
 #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
@@ -33,7 +31,7 @@ __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2n
 //-----------------------------------------------------------
 
 __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
-                                     Containers::VectorView<int, Devices::Cuda> blocksNeeded)
+                                     Containers::ArrayView<int, Devices::Cuda> blocksNeeded)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= cuda_tasks.getSize())
@@ -49,7 +47,7 @@ __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda>
 template <typename Value, typename CMP>
 __global__ void cudaInitTask(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks,
                              Containers::ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping,
-                             Containers::VectorView<int, Devices::Cuda> cuda_reductionTaskInitMem,
+                             Containers::ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                              Containers::ArrayView<Value, Devices::Cuda> src, CMP Cmp)
 {
     if (blockIdx.x >= cuda_tasks.getSize())
-- 
GitLab


From 0d2a57812d7f05d5ff1273121f2074356877ac91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sat, 31 Jul 2021 15:13:49 +0200
Subject: [PATCH 42/52] Removed old workaround for nvcc from
 CudaReductionKernel

---
 .../Algorithms/detail/CudaReductionKernel.h   | 58 ++++++-------------
 1 file changed, 17 insertions(+), 41 deletions(-)

diff --git a/src/TNL/Algorithms/detail/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
index 92f54fc1b..126884eaf 100644
--- a/src/TNL/Algorithms/detail/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -24,30 +24,6 @@ namespace Algorithms {
 namespace detail {
 
 #ifdef HAVE_CUDA
-/*
- * nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the
- * std::plus<void>::operator() function to be constexpr and hence __host__ __device__
- * (for example, when the arguments are StaticVector<3, double> etc). Hence, we use
- * this wrapper which triggers only a warning and not an error as is the case when
- * the reduction functor is called from a __global__ or __device__ function. Let's
- * hope it works otherwise...
- */
-template< typename Reduction, typename Arg1, typename Arg2 >
-__host__ __device__
-auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg2 )
-{
-// let's suppress the aforementioned warning...
-#ifdef __NVCC__
-#pragma push
-#pragma diag_suppress 2979  // error number for nvcc 10.2
-#pragma diag_suppress 3123  // error number for nvcc 11.1
-#endif
-   return std::forward<Reduction>(reduction)( std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) );
-#ifdef __NVCC__
-#pragma pop
-#endif
-}
-
 /* Template for cooperative reduction across the CUDA block of threads.
  * It is a *cooperative* operation - all threads must call the operation,
  * otherwise it will deadlock!
@@ -95,48 +71,48 @@ struct CudaBlockReduce
 
       if( blockSize >= 1024 ) {
          if( tid < 512 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 512 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 512 ] );
          __syncthreads();
       }
       if( blockSize >= 512 ) {
          if( tid < 256 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 256 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 256 ] );
          __syncthreads();
       }
       if( blockSize >= 256 ) {
          if( tid < 128 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 128 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 128 ] );
          __syncthreads();
       }
       if( blockSize >= 128 ) {
          if( tid <  64 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 64 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 64 ] );
          __syncthreads();
       }
 
       // This runs in one warp so we use __syncwarp() instead of __syncthreads().
       if( tid < 32 ) {
          if( blockSize >= 64 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 32 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 32 ] );
          __syncwarp();
          // Note that here we do not have to check if tid < 16 etc, because we have
          // 2 * blockSize.x elements of shared memory per block, so we do not
          // access out of bounds. The results for the upper half will be undefined,
          // but unused anyway.
          if( blockSize >= 32 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 16 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 16 ] );
          __syncwarp();
          if( blockSize >= 16 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 8 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 8 ] );
          __syncwarp();
          if( blockSize >=  8 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 4 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 4 ] );
          __syncwarp();
          if( blockSize >=  4 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 2 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 2 ] );
          __syncwarp();
          if( blockSize >=  2 )
-            storage.data[ tid ] = CudaReductionFunctorWrapper( reduction, storage.data[ tid ], storage.data[ tid + 1 ] );
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 1 ] );
       }
 
       __syncthreads();
@@ -422,19 +398,19 @@ CudaReductionKernel( DataFetcher dataFetcher,
    // Start with the sequential reduction and push the result into the shared memory.
    Result result = identity;
    while( begin + 4 * gridSize < end ) {
-      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin ) );
-      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + gridSize ) );
-      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + 2 * gridSize ) );
-      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + 3 * gridSize ) );
+      result = reduction( result, dataFetcher( begin ) );
+      result = reduction( result, dataFetcher( begin + gridSize ) );
+      result = reduction( result, dataFetcher( begin + 2 * gridSize ) );
+      result = reduction( result, dataFetcher( begin + 3 * gridSize ) );
       begin += 4 * gridSize;
    }
    while( begin + 2 * gridSize < end ) {
-      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin ) );
-      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin + gridSize ) );
+      result = reduction( result, dataFetcher( begin ) );
+      result = reduction( result, dataFetcher( begin + gridSize ) );
       begin += 2 * gridSize;
    }
    while( begin < end ) {
-      result = CudaReductionFunctorWrapper( reduction, result, dataFetcher( begin ) );
+      result = reduction( result, dataFetcher( begin ) );
       begin += gridSize;
    }
    __syncthreads();
-- 
GitLab


From 71ac38b23fe7ee03770ae567ef01f5a9a11abcbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Mon, 2 Aug 2021 15:29:57 +0200
Subject: [PATCH 43/52] Removed containsValue and containsOnlyValue from Array
 and ArrayView

The algorithms are implemented as plain functions in TNL::Algorithms.
containsValue was replaced with contains.
---
 .../Examples/Containers/ArrayExample.cpp      |  10 --
 .../Examples/Containers/ArrayViewExample.cpp  |  10 --
 .../Examples/Containers/VectorExample.cpp     |   2 -
 Documentation/Tutorials/Arrays/CMakeLists.txt |   6 +-
 .../Tutorials/Arrays/ContainsValue.cu         |   1 -
 .../{ContainsValue.cpp => contains.cpp}       |  21 +--
 Documentation/Tutorials/Arrays/contains.cu    |   1 +
 .../Tutorials/Arrays/tutorial_Arrays.md       |  10 +-
 .../tnlFastSweepingMethod2D_impl.h            |   5 +-
 .../tnlFastSweepingMethod3D_impl.h            |   5 +-
 src/TNL/Algorithms/MemoryOperations.h         |  38 -----
 src/TNL/Algorithms/MemoryOperationsCuda.hpp   |  32 ----
 src/TNL/Algorithms/MemoryOperationsHost.hpp   |  44 -----
 .../Algorithms/MemoryOperationsSequential.hpp |  38 -----
 src/TNL/Algorithms/contains.h                 |  75 ++++++++
 src/TNL/Algorithms/detail/Contains.h          | 160 ++++++++++++++++++
 src/TNL/Containers/Array.h                    |  41 +----
 src/TNL/Containers/Array.hpp                  |  34 ----
 src/TNL/Containers/ArrayView.h                |  38 -----
 src/TNL/Containers/ArrayView.hpp              |  28 ---
 src/TNL/Containers/DistributedArray.h         |   6 -
 src/TNL/Containers/DistributedArray.hpp       |  22 ---
 src/TNL/Containers/DistributedArrayView.h     |   6 -
 src/TNL/Containers/DistributedArrayView.hpp   |  30 ----
 .../DistributedMeshes/distributeSubentities.h |   3 +-
 src/UnitTests/Algorithms/CMakeLists.txt       |   1 +
 .../Algorithms/MemoryOperationsTest.h         |  90 ----------
 src/UnitTests/Algorithms/containsTest.cpp     |   1 +
 src/UnitTests/Algorithms/containsTest.cu      |   1 +
 src/UnitTests/Algorithms/containsTest.h       | 106 ++++++++++++
 src/UnitTests/Containers/ArrayTest.h          |  34 ----
 src/UnitTests/Containers/ArrayViewTest.h      |  38 -----
 .../Containers/DistributedArrayTest.h         |  37 ----
 .../Matrices/BinarySparseMatrixCopyTest.h     |   7 +-
 src/UnitTests/Matrices/DenseMatrixCopyTest.h  |   7 +-
 .../Matrices/MultidiagonalMatrixTest.h        |  26 +--
 src/UnitTests/Matrices/SparseMatrixCopyTest.h |   7 +-
 37 files changed, 400 insertions(+), 621 deletions(-)
 delete mode 120000 Documentation/Tutorials/Arrays/ContainsValue.cu
 rename Documentation/Tutorials/Arrays/{ContainsValue.cpp => contains.cpp} (74%)
 create mode 120000 Documentation/Tutorials/Arrays/contains.cu
 create mode 100644 src/TNL/Algorithms/contains.h
 create mode 100644 src/TNL/Algorithms/detail/Contains.h
 create mode 100644 src/UnitTests/Algorithms/containsTest.cpp
 create mode 100644 src/UnitTests/Algorithms/containsTest.cu
 create mode 100644 src/UnitTests/Algorithms/containsTest.h

diff --git a/Documentation/Examples/Containers/ArrayExample.cpp b/Documentation/Examples/Containers/ArrayExample.cpp
index 4d6ca48e5..71945e1df 100644
--- a/Documentation/Examples/Containers/ArrayExample.cpp
+++ b/Documentation/Examples/Containers/ArrayExample.cpp
@@ -40,16 +40,6 @@ void arrayExample()
    a1 = v;
    std::cout << "a1 = " << a1 << std::endl;
 
-   /***
-    * Simple array values checks can be done as follows ...
-    */
-   if( a1.containsValue( 1 ) )
-      std::cout << "a1 contains value 1." << std::endl;
-   if( a1.containsValue( size ) )
-      std::cout << "a1 contains value " << size << "." << std::endl;
-   if( a1.containsOnlyValue( 0 ) )
-      std::cout << "a2 contains only value 0." << std::endl;
-
    /***
     * You may swap array data with the swap method.
     */
diff --git a/Documentation/Examples/Containers/ArrayViewExample.cpp b/Documentation/Examples/Containers/ArrayViewExample.cpp
index fdc1897c5..23207e69f 100644
--- a/Documentation/Examples/Containers/ArrayViewExample.cpp
+++ b/Documentation/Examples/Containers/ArrayViewExample.cpp
@@ -29,16 +29,6 @@ void arrayViewExample()
     */
    a2_view = 0;
 
-   /***
-    * Simple array view values checks can be done as follows ...
-    */
-   if( a1_view.containsValue( 1 ) )
-      std::cout << "a1 contains value 1." << std::endl;
-   if( a1_view.containsValue( size ) )
-      std::cout << "a1 contains value " << size << "." << std::endl;
-   if( a1_view.containsOnlyValue( 0 ) )
-      std::cout << "a2 contains only value 0." << std::endl;
-
    /***
     * More efficient way of array view elements manipulation is with the lambda functions
     */
diff --git a/Documentation/Examples/Containers/VectorExample.cpp b/Documentation/Examples/Containers/VectorExample.cpp
index a3fdf99d9..4f126dfb2 100644
--- a/Documentation/Examples/Containers/VectorExample.cpp
+++ b/Documentation/Examples/Containers/VectorExample.cpp
@@ -13,8 +13,6 @@ void VectorExample()
 {
     Containers::Vector< int, Device > vector1( 5 );
     vector1 = 0;
-    cout << "Does vector contain 1?" << vector1.containsValue( 1 ) << endl;
-    cout << "Does vector contain only zeros?" << vector1.containsOnlyValue( 0 ) << endl;
 
     Containers::Vector< int, Device > vector2( 3 );
     vector2 = 1;
diff --git a/Documentation/Tutorials/Arrays/CMakeLists.txt b/Documentation/Tutorials/Arrays/CMakeLists.txt
index cc1f52267..71facac2f 100644
--- a/Documentation/Tutorials/Arrays/CMakeLists.txt
+++ b/Documentation/Tutorials/Arrays/CMakeLists.txt
@@ -9,8 +9,8 @@ IF( BUILD_CUDA )
    ADD_CUSTOM_COMMAND( COMMAND ArrayView-2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ArrayView-2.out OUTPUT ArrayView-2.out )
    CUDA_ADD_EXECUTABLE( ArrayViewForElements ArrayViewForElements.cu )
    ADD_CUSTOM_COMMAND( COMMAND ArrayViewForElements > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ArrayViewForElements.out OUTPUT ArrayViewForElements.out )
-   CUDA_ADD_EXECUTABLE( ContainsValue ContainsValue.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ContainsValue > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ContainsValue.out OUTPUT ContainsValue.out )
+   CUDA_ADD_EXECUTABLE( contains contains.cu )
+   ADD_CUSTOM_COMMAND( COMMAND contains > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/contains.out OUTPUT contains.out )
    CUDA_ADD_EXECUTABLE( ElementsAccessing-1 ElementsAccessing-1.cu )
    ADD_CUSTOM_COMMAND( COMMAND ElementsAccessing-1 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ElementsAccessing-1.out OUTPUT ElementsAccessing-1.out )
    CUDA_ADD_EXECUTABLE( ElementsAccessing-2 ElementsAccessing-2.cu )
@@ -25,7 +25,7 @@ ADD_CUSTOM_TARGET( TutorialsArrays-cuda ALL DEPENDS
    ArrayIO.out
    ArrayView-1.out
    ArrayView-2.out
-   ContainsValue.out
+   contains.out
    ElementsAccessing-1.out
    ElementsAccessing-2.out
    ArrayViewForElements.out
diff --git a/Documentation/Tutorials/Arrays/ContainsValue.cu b/Documentation/Tutorials/Arrays/ContainsValue.cu
deleted file mode 120000
index 015d07af1..000000000
--- a/Documentation/Tutorials/Arrays/ContainsValue.cu
+++ /dev/null
@@ -1 +0,0 @@
-ContainsValue.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Arrays/ContainsValue.cpp b/Documentation/Tutorials/Arrays/contains.cpp
similarity index 74%
rename from Documentation/Tutorials/Arrays/ContainsValue.cpp
rename to Documentation/Tutorials/Arrays/contains.cpp
index 4b726a7bd..d840c8751 100644
--- a/Documentation/Tutorials/Arrays/ContainsValue.cpp
+++ b/Documentation/Tutorials/Arrays/contains.cpp
@@ -1,9 +1,10 @@
 #include <iostream>
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/ArrayView.h>
+#include <TNL/Algorithms/contains.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::Algorithms;
 
 int main( int argc, char* argv[] )
 {
@@ -18,35 +19,35 @@ int main( int argc, char* argv[] )
    /****
     * Test the values stored in the arrays
     */
-   if( a.containsValue( 0.0 ) )
+   if( contains( a, 0.0 ) )
       std::cout << "a contains 0" << std::endl;
 
-   if( a.containsValue( 1.0 ) )
+   if( contains( a, 1.0 ) )
       std::cout << "a contains 1" << std::endl;
 
-   if( b.containsValue( 0.0 ) )
+   if( contains( b, 0.0 ) )
       std::cout << "b contains 0" << std::endl;
 
-   if( b.containsValue( 1.0 ) )
+   if( contains( b, 1.0 ) )
       std::cout << "b contains 1" << std::endl;
 
-   if( a.containsOnlyValue( 0.0 ) )
+   if( containsOnlyValue( a, 0.0 ) )
       std::cout << "a contains only 0" << std::endl;
 
-   if( a.containsOnlyValue( 1.0 ) )
+   if( containsOnlyValue( a, 1.0 ) )
       std::cout << "a contains only 1" << std::endl;
 
-   if( b.containsOnlyValue( 0.0 ) )
+   if( containsOnlyValue( b, 0.0 ) )
       std::cout << "b contains only 0" << std::endl;
 
-   if( b.containsOnlyValue( 1.0 ) )
+   if( containsOnlyValue( b, 1.0 ) )
       std::cout << "b contains only 1" << std::endl;
 
    /****
     * Change the first half of b and test it again
     */
    b.forElements( 0, 5, [=] __cuda_callable__ ( int i, float& value ) { value = 0.0; } );
-   if( b.containsOnlyValue( 0.0, 0, 5 ) )
+   if( containsOnlyValue( b, 0.0, 0, 5 ) )
       std::cout << "First five elements of b contains only 0" << std::endl;
 }
 
diff --git a/Documentation/Tutorials/Arrays/contains.cu b/Documentation/Tutorials/Arrays/contains.cu
new file mode 120000
index 000000000..6b27a9bc0
--- /dev/null
+++ b/Documentation/Tutorials/Arrays/contains.cu
@@ -0,0 +1 @@
+contains.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Arrays/tutorial_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
index ad3016411..a8405e5f0 100644
--- a/Documentation/Tutorials/Arrays/tutorial_Arrays.md
+++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
@@ -105,13 +105,17 @@ Output:
 
 ### Checking the array contents
 
-Methods `containsValue` and `containsOnlyValue` serve for testing the contents of the arrays. `containsValue` returns `true` of there is at least one element in the array with given value. `containsOnlyValue` returns `true` only if all elements of the array equal given value. The test can be restricted to subinterval of array elements. Both methods are implemented in `Array` as well as in `ArrayView`. See the following code snippet for example of use.
+The functions \ref TNL::Algorithms::contains and \ref TNL::Algorithms::containsOnlyValue serve for testing the contents of arrays, vectors or their views.
+`contains` returns `true` if there is at least one element in the array with given value.
+`containsOnlyValue` returns `true` only if all elements of the array are equal to the given value.
+The test can be restricted to a subinterval of array elements.
+See the following code snippet for a usage example.
 
-\include ContainsValue.cpp
+\include contains.cpp
 
 Output:
 
-\include ContainsValue.out
+\include contains.out
 
 ### IO operations with arrays
 
diff --git a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index f4822b6ca..01a5307d7 100644
--- a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Real,
         typename Device,
@@ -323,7 +324,7 @@ solve( const Meshes::DistributedMeshes::DistributedMesh< MeshType >& distributed
 
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
 
-           calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
+           calculateCudaBlocksAgain = Algorithms::contains( blockCalculationIndicator, 1);
           */
   /**------------------------------------------------------------------------------------------------*/
 
@@ -349,7 +350,7 @@ solve( const Meshes::DistributedMeshes::DistributedMesh< MeshType >& distributed
           TNL_CHECK_CUDA_DEVICE;
 
           // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain
-          calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
+          calculateCudaBlocksAgain = Algorithms::contains( blockCalculationIndicator, 1);
 
           // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
          if( calculateCudaBlocksAgain ){
diff --git a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index e0f6ff58a..e7f82880c 100644
--- a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Real,
         typename Device,
@@ -336,8 +337,8 @@ solve( const Meshes::DistributedMeshes::DistributedMesh< MeshType >& distributed
           BlockIterDevice = BlockIterPom;
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
-          // .containsValue(1) is actually parallel reduction implemented in TNL
-          BlockIterD = BlockIterDevice.containsValue(1);
+          // contains(...) is actually a parallel reduction implemented in TNL
+          BlockIterD = Algorithms::contains( BlockIterDevice, 1);
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
 
diff --git a/src/TNL/Algorithms/MemoryOperations.h b/src/TNL/Algorithms/MemoryOperations.h
index 42c37f062..56c3498bf 100644
--- a/src/TNL/Algorithms/MemoryOperations.h
+++ b/src/TNL/Algorithms/MemoryOperations.h
@@ -81,20 +81,6 @@ struct MemoryOperations< Devices::Sequential >
    static bool compare( const Element1* destination,
                         const Element2* source,
                         const Index size );
-
-   template< typename Element,
-             typename Index >
-   __cuda_callable__
-   static bool containsValue( const Element* data,
-                              const Index size,
-                              const Element& value );
-
-   template< typename Element,
-             typename Index >
-   __cuda_callable__
-   static bool containsOnlyValue( const Element* data,
-                                  const Index size,
-                                  const Element& value );
 };
 
 template<>
@@ -155,18 +141,6 @@ struct MemoryOperations< Devices::Host >
    static bool compare( const Element1* destination,
                         const Element2* source,
                         const Index size );
-
-   template< typename Element,
-             typename Index >
-   static bool containsValue( const Element* data,
-                              const Index size,
-                              const Element& value );
-
-   template< typename Element,
-             typename Index >
-   static bool containsOnlyValue( const Element* data,
-                                  const Index size,
-                                  const Element& value );
 };
 
 template<>
@@ -224,18 +198,6 @@ struct MemoryOperations< Devices::Cuda >
    static bool compare( const Element1* destination,
                         const Element2* source,
                         const Index size );
-
-   template< typename Element,
-             typename Index >
-   static bool containsValue( const Element* data,
-                              const Index size,
-                              const Element& value );
-
-   template< typename Element,
-             typename Index >
-   static bool containsOnlyValue( const Element* data,
-                                  const Index size,
-                                  const Element& value );
 };
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index b5db72b2a..4c84ec697 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -185,37 +185,5 @@ compare( const Element1* destination,
    return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
 }
 
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Cuda >::
-containsValue( const Element* data,
-               const Index size,
-               const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, (Index) 0, "" );
-
-   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
-}
-
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Cuda >::
-containsOnlyValue( const Element* data,
-                   const Index size,
-                   const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
-}
-
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/MemoryOperationsHost.hpp b/src/TNL/Algorithms/MemoryOperationsHost.hpp
index dc5aa9b24..7dd2ef1ba 100644
--- a/src/TNL/Algorithms/MemoryOperationsHost.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsHost.hpp
@@ -169,49 +169,5 @@ compare( const DestinationElement* destination,
    }
 }
 
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Host >::
-containsValue( const Element* data,
-               const Index size,
-               const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
-      auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; };
-      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
-   }
-   else {
-      // sequential algorithm can return as soon as it finds a match
-      return MemoryOperations< Devices::Sequential >::containsValue( data, size, value );
-   }
-}
-
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Host >::
-containsOnlyValue( const Element* data,
-                   const Index size,
-                   const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
-      auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; };
-      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
-   }
-   else {
-      // sequential algorithm can return as soon as it finds a mismatch
-      return MemoryOperations< Devices::Sequential >::containsOnlyValue( data, size, value );
-   }
-}
-
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/MemoryOperationsSequential.hpp b/src/TNL/Algorithms/MemoryOperationsSequential.hpp
index 2ea21d0ac..dd7765dec 100644
--- a/src/TNL/Algorithms/MemoryOperationsSequential.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsSequential.hpp
@@ -147,43 +147,5 @@ compare( const Element1* destination,
    return true;
 }
 
-template< typename Element,
-          typename Index >
-__cuda_callable__
-bool
-MemoryOperations< Devices::Sequential >::
-containsValue( const Element* data,
-               const Index size,
-               const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   for( Index i = 0; i < size; i++ )
-      if( data[ i ] == value )
-         return true;
-   return false;
-}
-
-template< typename Element,
-          typename Index >
-__cuda_callable__
-bool
-MemoryOperations< Devices::Sequential >::
-containsOnlyValue( const Element* data,
-                   const Index size,
-                   const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   for( Index i = 0; i < size; i++ )
-      if( ! ( data[ i ] == value ) )
-         return false;
-   return true;
-}
-
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/contains.h b/src/TNL/Algorithms/contains.h
new file mode 100644
index 000000000..a3a2524f2
--- /dev/null
+++ b/src/TNL/Algorithms/contains.h
@@ -0,0 +1,75 @@
+/***************************************************************************
+                          contains.h  -  description
+                             -------------------
+    begin                : Jul 27, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/detail/Contains.h>
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Checks if an array/vector/view contains an element with given value.
+ *
+ * By default, all elements of the array are checked. If \e begin or \e end is
+ * set to a non-zero value, only elements in the sub-interval `[begin, end)` are
+ * checked.
+ *
+ * \param array The array to be searched.
+ * \param value The value to be checked.
+ * \param begin The beginning of the array sub-interval. It is 0 by default.
+ * \param end The end of the array sub-interval. The default value is 0 which
+ *            is, however, replaced with the array size.
+ * \return `true` if there is _at least one_ element in the sub-interval
+ *         `[begin, end)` which has the value \e value.
+ */
+template< typename Array >
+bool
+contains( const Array& array,
+          typename Array::ValueType value,
+          typename Array::IndexType begin = 0,
+          typename Array::IndexType end = 0 )
+{
+   TNL_ASSERT_TRUE( array.getData(), "Attempted to check a value of an empty array." );
+   if( end == 0 )
+      end = array.getSize();
+   return detail::Contains< typename Array::DeviceType >()( array.getData() + begin, end - begin, value );
+}
+
+/**
+ * \brief Checks if all elements of an array/vector/view have the given value.
+ *
+ * By default, all elements of the array are checked. If \e begin or \e end is
+ * set to a non-zero value, only elements in the sub-interval `[begin, end)` are
+ * checked.
+ *
+ * \param array The array to be searched.
+ * \param value The value to be checked.
+ * \param begin The beginning of the array sub-interval. It is 0 by default.
+ * \param end The end of the array sub-interval. The default value is 0 which
+ *            is, however, replaced with the array size.
+ * \return `true` if _all_ elements in the sub-interval `[begin, end)` have the
+ *         same value \e value; `false` if the sub-interval is empty.
+ */
+template< typename Array >
+bool
+containsOnlyValue( const Array& array,
+                   typename Array::ValueType value,
+                   typename Array::IndexType begin = 0,
+                   typename Array::IndexType end = 0 )
+{
+   TNL_ASSERT_TRUE( array.getData(), "Attempted to check a value of an empty array." );
+   if( end == 0 )
+      end = array.getSize();
+   return detail::ContainsOnlyValue< typename Array::DeviceType >()( array.getData() + begin, end - begin, value );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/detail/Contains.h b/src/TNL/Algorithms/detail/Contains.h
new file mode 100644
index 000000000..77a191b30
--- /dev/null
+++ b/src/TNL/Algorithms/detail/Contains.h
@@ -0,0 +1,160 @@
+/***************************************************************************
+                          Contains.h  -  description
+                             -------------------
+    begin                : Jul 27, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
+#include <TNL/Algorithms/reduce.h>
+
+namespace TNL {
+namespace Algorithms {
+namespace detail {
+
+template< typename Device >
+struct Contains;
+
+template< typename Device >
+struct ContainsOnlyValue;
+
+
+template<>
+struct Contains< Devices::Sequential >
+{
+   template< typename Element,
+             typename Index >
+   __cuda_callable__
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      for( Index i = 0; i < size; i++ )
+         if( data[ i ] == value )
+            return true;
+      return false;
+   }
+};
+
+template<>
+struct ContainsOnlyValue< Devices::Sequential >
+{
+   template< typename Element,
+             typename Index >
+   __cuda_callable__
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      for( Index i = 0; i < size; i++ )
+         if( ! ( data[ i ] == value ) )
+            return false;
+      return true;
+   }
+};
+
+
+template<>
+struct Contains< Devices::Host >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
+         auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; };
+         return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+      }
+      else {
+         // sequential algorithm can return as soon as it finds a match
+         return Contains< Devices::Sequential >{}( data, size, value );
+      }
+   }
+};
+
+template<>
+struct ContainsOnlyValue< Devices::Host >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
+         auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; };
+         return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+      }
+      else {
+         // sequential algorithm can return as soon as it finds a mismatch
+         return ContainsOnlyValue< Devices::Sequential >{}( data, size, value );
+      }
+   }
+};
+
+
+template<>
+struct Contains< Devices::Cuda >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, (Index) 0, "" );
+
+      auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
+      return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+   }
+};
+
+template<>
+struct ContainsOnlyValue< Devices::Cuda >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
+      return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+   }
+};
+
+} // namespace detail
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index c33e1283e..7d00683a3 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -56,8 +56,7 @@ template< int, typename > class StaticArray;
  * explicit data transfer which is not buffered, so it can be very slow.
  *
  * Other methods, such as \ref operator=, \ref operator==, \ref operator!=,
- * \ref setValue, \ref containsValue, \ref containsOnlyValue, and \ref evaluate,
- * provide various operations on whole arrays.
+ * \ref setValue, and \ref evaluate, provide various operations on whole arrays.
  *
  * See also \ref ArrayView, \ref Vector, \ref VectorView.
  *
@@ -722,44 +721,6 @@ class Array
       template< typename Function >
       void forAllElements( Function&& f ) const;
 
-      /**
-       * \brief Checks if there is an element with value \e v.
-       *
-       * By default, all elements of the array are checked. If \e begin or
-       * \e end is set to a non-zero value, only elements in the sub-interval
-       * `[begin, end)` are checked.
-       *
-       * \param value The value to be checked.
-       * \param begin The beginning of the array sub-interval. It is 0 by
-       *              default.
-       * \param end The end of the array sub-interval. The default value is 0
-       *            which is, however, replaced with the array size.
-       * \return `true` if there is _at least one_ element in the sub-interval
-       *         `[begin, end)` which has the value \e value.
-       */
-      bool containsValue( ValueType value,
-                          IndexType begin = 0,
-                          IndexType end = 0 ) const;
-
-      /**
-       * \brief Checks if all elements have the same value \e v.
-       *
-       * By default, all elements of the array are checked. If \e begin or
-       * \e end is set to a non-zero value, only elements in the sub-interval
-       * `[begin, end)` are checked.
-       *
-       * \param value The value to be checked.
-       * \param begin The beginning of the array sub-interval. It is 0 by
-       *              default.
-       * \param end The end of the array sub-interval. The default value is 0
-       *            which is, however, replaced with the array size.
-       * \return `true` if _all_ elements in the sub-interval `[begin, end)`
-       *         have the same value \e value.
-       */
-      bool containsOnlyValue( ValueType value,
-                              IndexType begin = 0,
-                              IndexType end = 0 ) const;
-
       /**
        * \brief Method for saving the array to a binary file \e fileName.
        *
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index f6a25925f..e01566e50 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -756,40 +756,6 @@ forAllElements( Function&& f ) const
    view.forAllElements( f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-Array< Value, Device, Index, Allocator >::
-containsValue( ValueType value,
-               IndexType begin,
-               IndexType end ) const
-{
-   TNL_ASSERT_TRUE( this->getData(), "Attempted to check a value of an empty array." );
-   if( end == 0 )
-      end = this->getSize();
-
-   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, value );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-Array< Value, Device, Index, Allocator >::
-containsOnlyValue( ValueType value,
-                   IndexType begin,
-                   IndexType end ) const
-{
-   TNL_ASSERT_TRUE( this->getData(), "Attempted to check a value of an empty array." );
-   if( end == 0 )
-      end = this->getSize();
-
-   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
-}
-
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 46fc5c37a..b3e2416e0 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -541,44 +541,6 @@ public:
    template< typename Function >
    void forAllElements( Function&& f ) const;
 
-   /**
-    * \brief Checks if there is an element with value \e v.
-    *
-    * By default, all elements of the array view are checked. If \e begin or
-    * \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are checked.
-    *
-    * \param value The value to be checked.
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array view size.
-    * \return `true` if there is _at least one_ element in the sub-interval
-    *         `[begin, end)` which has the value \e value.
-    */
-   bool containsValue( ValueType value,
-                       IndexType begin = 0,
-                       IndexType end = 0 ) const;
-
-   /**
-    * \brief Checks if all elements have the same value \e v.
-    *
-    * By default, all elements of the array view are checked. If \e begin or
-    * \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are checked.
-    *
-    * \param value The value to be checked.
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array view size.
-    * \return `true` if _all_ elements in the sub-interval `[begin, end)`
-    *         have the same value \e value.
-    */
-   bool containsOnlyValue( ValueType value,
-                           IndexType begin = 0,
-                           IndexType end = 0 ) const;
-
    /**
     * \brief Method for saving the data to a binary file \e fileName.
     *
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index 0e2bb7b77..7771f7dc6 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -396,34 +396,6 @@ forAllElements( Function&& f ) const
    this->forElements( 0, this->getSize(), f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-ArrayView< Value, Device, Index >::
-containsValue( ValueType value,
-               IndexType begin,
-               IndexType end ) const
-{
-   if( end == 0 )
-      end = this->getSize();
-   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, value );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-ArrayView< Value, Device, Index >::
-containsOnlyValue( ValueType value,
-                   IndexType begin,
-                   IndexType end ) const
-{
-   if( end == 0 )
-      end = this->getSize();
-   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
-}
-
 template< typename Value,
           typename Device,
           typename Index >
diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 15d8eaa53..2c2690acd 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -254,12 +254,6 @@ public:
       void forElements( IndexType begin, IndexType end, Function&& f ) const;
 
 
-   // Checks if there is an element with given value in this array
-   bool containsValue( ValueType value ) const;
-
-   // Checks if all elements in this array have the same given value
-   bool containsOnlyValue( ValueType value ) const;
-
    // TODO: serialization (save, load)
 
 protected:
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index dcfaeee2d..bda82c8bd 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -473,27 +473,5 @@ forElements( IndexType begin, IndexType end, Function&& f ) const
    this->view.forElements( begin, end, f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-DistributedArray< Value, Device, Index, Allocator >::
-containsValue( ValueType value ) const
-{
-   return view.containsValue( value );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-DistributedArray< Value, Device, Index, Allocator >::
-containsOnlyValue( ValueType value ) const
-{
-   return view.containsOnlyValue( value );
-}
-
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index 9da306744..b99d08076 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -230,12 +230,6 @@ public:
       template< typename Function >
       void forElements( IndexType begin, IndexType end, Function&& f ) const;
 
-   // Checks if there is an element with given value in this array
-   bool containsValue( ValueType value ) const;
-
-   // Checks if all elements in this array have the same given value
-   bool containsOnlyValue( ValueType value ) const;
-
    std::ostream& print( std::ostream& str ) const;
 protected:
    LocalRangeType localRange;
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 40b6f509a..cb9edba19 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -472,36 +472,6 @@ forElements( IndexType begin, IndexType end, Function&& f ) const
 }
 
 
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-DistributedArrayView< Value, Device, Index >::
-containsValue( ValueType value ) const
-{
-   bool result = false;
-   if( group != MPI::NullGroup() ) {
-      const bool localResult = localData.containsValue( value );
-      MPI::Allreduce( &localResult, &result, 1, MPI_LOR, group );
-   }
-   return result;
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-DistributedArrayView< Value, Device, Index >::
-containsOnlyValue( ValueType value ) const
-{
-   bool result = true;
-   if( group != MPI::NullGroup() ) {
-      const bool localResult = localData.containsOnlyValue( value );
-      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
-   }
-   return result;
-}
-
 template< typename Value,
           typename Device,
           typename Index >
diff --git a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
index 38cc5ccdf..cfe5a9246 100644
--- a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
+++ b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
@@ -15,6 +15,7 @@
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/MeshDetails/layers/EntityTags/Traits.h>
 #include <TNL/Algorithms/scan.h>
+#include <TNL/Algorithms/contains.h>
 
 namespace TNL {
 namespace Meshes {
@@ -391,7 +392,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
       if( all_done )
          break;
    }
-   if( mesh.template getGlobalIndices< Dimension >().containsValue( padding_index ) )
+   if( Algorithms::contains( mesh.template getGlobalIndices< Dimension >(), padding_index ) )
       throw std::runtime_error( "some global indices were left unset" );
 
    // 7. reorder the entities to make sure that global indices are sorted
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 628ca8dee..0738a6f8a 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -2,6 +2,7 @@ ADD_SUBDIRECTORY( Segments )
 ADD_SUBDIRECTORY( Sorting )
 
 set( COMMON_TESTS
+         containsTest
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
diff --git a/src/UnitTests/Algorithms/MemoryOperationsTest.h b/src/UnitTests/Algorithms/MemoryOperationsTest.h
index ebfb01f1b..61938d82d 100644
--- a/src/UnitTests/Algorithms/MemoryOperationsTest.h
+++ b/src/UnitTests/Algorithms/MemoryOperationsTest.h
@@ -144,44 +144,6 @@ TYPED_TEST( MemoryOperationsTest, compareWithConversion_host )
    allocator2.deallocate( data2, ARRAY_TEST_SIZE );
 }
 
-TYPED_TEST( MemoryOperationsTest, containsValue_host )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using Allocator = Allocators::Host< ValueType >;
-
-   Allocator allocator;
-   ValueType* data = allocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      data[ i ] = i % 10;
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( ( MemoryOperations< Devices::Host >::containsValue( data, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Host >::containsValue( data, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   allocator.deallocate( data, ARRAY_TEST_SIZE );
-}
-
-TYPED_TEST( MemoryOperationsTest, containsOnlyValue_host )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using Allocator = Allocators::Host< ValueType >;
-
-   Allocator allocator;
-   ValueType* data = allocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      data[ i ] = i % 10;
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Host >::containsOnlyValue( data, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      data[ i ] = 10;
-   EXPECT_TRUE( ( MemoryOperations< Devices::Host >::containsOnlyValue( data, ARRAY_TEST_SIZE, (ValueType) 10 ) ) );
-
-   allocator.deallocate( data, ARRAY_TEST_SIZE );
-}
-
 
 #ifdef HAVE_CUDA
 TYPED_TEST( MemoryOperationsTest, allocateMemory_cuda )
@@ -353,58 +315,6 @@ TYPED_TEST( MemoryOperationsTest, compareWithConversions_cuda )
    cudaAllocator1.deallocate( deviceData, ARRAY_TEST_SIZE );
    cudaAllocator2.deallocate( deviceData2, ARRAY_TEST_SIZE );
 }
-
-TYPED_TEST( MemoryOperationsTest, containsValue_cuda )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using HostAllocator = Allocators::Host< ValueType >;
-   using CudaAllocator = Allocators::Cuda< ValueType >;
-
-   HostAllocator hostAllocator;
-   CudaAllocator cudaAllocator;
-   ValueType* hostData = hostAllocator.allocate( ARRAY_TEST_SIZE );
-   ValueType* deviceData = cudaAllocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      hostData[ i ] = i % 10;
-   MultiDeviceMemoryOperations< Devices::Cuda, Devices::Host >::copy( deviceData, hostData, ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( ( MemoryOperations< Devices::Cuda >::containsValue( deviceData, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Cuda >::containsValue( deviceData, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   hostAllocator.deallocate( hostData, ARRAY_TEST_SIZE );
-   cudaAllocator.deallocate( deviceData, ARRAY_TEST_SIZE );
-}
-
-TYPED_TEST( MemoryOperationsTest, containsOnlyValue_cuda )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using HostAllocator = Allocators::Host< ValueType >;
-   using CudaAllocator = Allocators::Cuda< ValueType >;
-
-   HostAllocator hostAllocator;
-   CudaAllocator cudaAllocator;
-   ValueType* hostData = hostAllocator.allocate( ARRAY_TEST_SIZE );
-   ValueType* deviceData = cudaAllocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      hostData[ i ] = i % 10;
-   MultiDeviceMemoryOperations< Devices::Cuda, Devices::Host >::copy( deviceData, hostData, ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Cuda >::containsOnlyValue( deviceData, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      hostData[ i ] = 10;
-   MultiDeviceMemoryOperations< Devices::Cuda, Devices::Host >::copy( deviceData, hostData, ARRAY_TEST_SIZE );
-
-   EXPECT_TRUE( ( MemoryOperations< Devices::Cuda >::containsOnlyValue( deviceData, ARRAY_TEST_SIZE, (ValueType) 10 ) ) );
-
-   hostAllocator.deallocate( hostData, ARRAY_TEST_SIZE );
-   cudaAllocator.deallocate( deviceData, ARRAY_TEST_SIZE );
-}
 #endif // HAVE_CUDA
 #endif // HAVE_GTEST
 
diff --git a/src/UnitTests/Algorithms/containsTest.cpp b/src/UnitTests/Algorithms/containsTest.cpp
new file mode 100644
index 000000000..7435d1282
--- /dev/null
+++ b/src/UnitTests/Algorithms/containsTest.cpp
@@ -0,0 +1 @@
+#include "containsTest.h"
diff --git a/src/UnitTests/Algorithms/containsTest.cu b/src/UnitTests/Algorithms/containsTest.cu
new file mode 100644
index 000000000..7435d1282
--- /dev/null
+++ b/src/UnitTests/Algorithms/containsTest.cu
@@ -0,0 +1 @@
+#include "containsTest.h"
diff --git a/src/UnitTests/Algorithms/containsTest.h b/src/UnitTests/Algorithms/containsTest.h
new file mode 100644
index 000000000..6598924ff
--- /dev/null
+++ b/src/UnitTests/Algorithms/containsTest.h
@@ -0,0 +1,106 @@
+/***************************************************************************
+                          containsTest.h  -  description
+                             -------------------
+    begin                : Jul 15, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifdef HAVE_GTEST
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/contains.h>
+
+#include "gtest/gtest.h"
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+using namespace TNL::Containers;
+
+// test fixture for typed tests
+template< typename Array >
+class ContainsTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;
+};
+
+// types for which ContainsTest is instantiated
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< int,    Devices::Sequential, short >,
+   Array< long,   Devices::Sequential, short >,
+   Array< double, Devices::Sequential, short >,
+   Array< int,    Devices::Sequential, int >,
+   Array< long,   Devices::Sequential, int >,
+   Array< double, Devices::Sequential, int >,
+   Array< int,    Devices::Sequential, long >,
+   Array< long,   Devices::Sequential, long >,
+   Array< double, Devices::Sequential, long >,
+
+   Array< int,    Devices::Host, short >,
+   Array< long,   Devices::Host, short >,
+   Array< double, Devices::Host, short >,
+   Array< int,    Devices::Host, int >,
+   Array< long,   Devices::Host, int >,
+   Array< double, Devices::Host, int >,
+   Array< int,    Devices::Host, long >,
+   Array< long,   Devices::Host, long >,
+   Array< double, Devices::Host, long >
+#endif
+#ifdef HAVE_CUDA
+   Array< int,    Devices::Cuda, short >,
+   Array< long,   Devices::Cuda, short >,
+   Array< double, Devices::Cuda, short >,
+   Array< int,    Devices::Cuda, int >,
+   Array< long,   Devices::Cuda, int >,
+   Array< double, Devices::Cuda, int >,
+   Array< int,    Devices::Cuda, long >,
+   Array< long,   Devices::Cuda, long >,
+   Array< double, Devices::Cuda, long >
+#endif
+>;
+
+TYPED_TEST_SUITE( ContainsTest, ArrayTypes );
+
+TYPED_TEST( ContainsTest, contains )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+
+   ArrayType array;
+   array.setSize( 1024 );
+
+   for( int i = 0; i < array.getSize(); i++ )
+      array.setElement( i, i % 10 );
+
+   for( int i = 0; i < 10; i++ )
+      EXPECT_TRUE( contains( array, i ) );
+
+   for( int i = 10; i < 20; i++ )
+      EXPECT_FALSE( contains( array, i ) );
+}
+
+TYPED_TEST( ContainsTest, containsOnlyValue )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+
+   ArrayType array;
+   array.setSize( 1024 );
+
+   for( int i = 0; i < array.getSize(); i++ )
+      array.setElement( i, i % 10 );
+
+   for( int i = 0; i < 20; i++ )
+      EXPECT_FALSE( containsOnlyValue( array, i ) );
+
+   array.setValue( 100 );
+   EXPECT_TRUE( containsOnlyValue( array, 100 ) );
+}
+
+#endif // HAVE_GTEST
+
+
+#include "../main.h"
diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index fe7dd55e4..78bd388a4 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -512,40 +512,6 @@ TYPED_TEST( ArrayTest, forElements )
    testArrayForEachElement< typename TestFixture::ArrayType >();
 }
 
-TYPED_TEST( ArrayTest, containsValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-
-   ArrayType array;
-   array.setSize( 1024 );
-
-   for( int i = 0; i < array.getSize(); i++ )
-      array.setElement( i, i % 10 );
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( array.containsValue( i ) );
-
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( array.containsValue( i ) );
-}
-
-TYPED_TEST( ArrayTest, containsOnlyValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-
-   ArrayType array;
-   array.setSize( 1024 );
-
-   for( int i = 0; i < array.getSize(); i++ )
-      array.setElement( i, i % 10 );
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( array.containsOnlyValue( i ) );
-
-   array.setValue( 100 );
-   EXPECT_TRUE( array.containsOnlyValue( 100 ) );
-}
-
 TYPED_TEST( ArrayTest, comparisonOperator )
 {
    using ArrayType = typename TestFixture::ArrayType;
diff --git a/src/UnitTests/Containers/ArrayViewTest.h b/src/UnitTests/Containers/ArrayViewTest.h
index 8b9e8157b..93c593bd3 100644
--- a/src/UnitTests/Containers/ArrayViewTest.h
+++ b/src/UnitTests/Containers/ArrayViewTest.h
@@ -361,44 +361,6 @@ TYPED_TEST( ArrayViewTest, evaluate )
    ArrayViewEvaluateTest( u );
 }
 
-TYPED_TEST( ArrayViewTest, containsValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-   using ViewType = typename TestFixture::ViewType;
-
-   ArrayType a;
-   a.setSize( 1024 );
-   ViewType v = a.getView();
-
-   for( int i = 0; i < v.getSize(); i++ )
-      v.setElement( i, i % 10 );
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( v.containsValue( i ) );
-
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( v.containsValue( i ) );
-}
-
-TYPED_TEST( ArrayViewTest, containsOnlyValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-   using ViewType = typename TestFixture::ViewType;
-
-   ArrayType a;
-   a.setSize( 1024 );
-   ViewType v = a.getView();
-
-   for( int i = 0; i < v.getSize(); i++ )
-      v.setElement( i, i % 10 );
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( v.containsOnlyValue( i ) );
-
-   a.setValue( 100 );
-   EXPECT_TRUE( v.containsOnlyValue( 100 ) );
-}
-
 TYPED_TEST( ArrayViewTest, comparisonOperator )
 {
    using ArrayType = typename TestFixture::ArrayType;
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index e25739afe..bc0edb445 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -291,43 +291,6 @@ TYPED_TEST( DistributedArrayTest, comparisonOperators )
    EXPECT_TRUE( u == v );
 }
 
-TYPED_TEST( DistributedArrayTest, containsValue )
-{
-   using IndexType = typename TestFixture::IndexType;
-
-   const auto localRange = this->distributedArray.getLocalRange();
-
-   for( int i = 0; i < localRange.getSize(); i++ ) {
-      const IndexType gi = localRange.getGlobalIndex( i );
-      this->distributedArray.setElement( gi, i % 10 );
-   }
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( this->distributedArray.containsValue( i ) );
-
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( this->distributedArray.containsValue( i ) );
-}
-
-TYPED_TEST( DistributedArrayTest, containsOnlyValue )
-{
-   using IndexType = typename TestFixture::IndexType;
-
-   const auto localRange = this->distributedArray.getLocalRange();
-
-   for( int i = 0; i < localRange.getSize(); i++ ) {
-      const IndexType gi = localRange.getGlobalIndex( i );
-      this->distributedArray.setElement( gi, i % 10 );
-   }
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( this->distributedArray.containsOnlyValue( i ) );
-
-   this->distributedArray.setValue( 100 );
-   this->distributedArray.waitForSynchronization();
-   EXPECT_TRUE( this->distributedArray.containsOnlyValue( 100 ) );
-}
-
 TYPED_TEST( DistributedArrayTest, empty )
 {
    EXPECT_GT( this->distributedArray.getSize(), 0 );
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
index 0f2b00595..9bfd551be 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
@@ -20,6 +20,7 @@
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
@@ -492,7 +493,7 @@ void multidiagonalMatrixAssignment()
    MultidiagonalHost hostMatrix( rows, columns, diagonals );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             hostMatrix.setElement( i, j, TNL::min( i + j, 1 ) );
 
    Matrix matrix;
@@ -509,7 +510,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), TNL::min( i + j, 1 ) );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -524,7 +525,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), TNL::min( i + j, 1 ) );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index fb1277ea2..b0cc4d9ac 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -20,6 +20,7 @@
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
@@ -458,7 +459,7 @@ void multidiagonalMatrixAssignment()
    MultidiagonalHost hostMatrix( rows, columns, diagonals );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             hostMatrix.setElement( i, j, i + j );
 
    Matrix matrix;
@@ -471,7 +472,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -486,7 +487,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
diff --git a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h
index 0f2a4a632..b4437a555 100644
--- a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h
+++ b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h
@@ -8,16 +8,16 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
+#include <iostream>
 #include <sstream>
 #include <TNL/Devices/Host.h>
 #include <TNL/Matrices/Matrix.h>
 #include <TNL/Matrices/MultidiagonalMatrix.h>
+#include <TNL/Algorithms/contains.h>
 #include <TNL/Containers/Array.h>
-
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
-#include <iostream>
 
 using Multidiagonal_host_float = TNL::Matrices::MultidiagonalMatrix< float, TNL::Devices::Host, int >;
 using Multidiagonal_host_int = TNL::Matrices::MultidiagonalMatrix< int, TNL::Devices::Host, int >;
@@ -174,7 +174,7 @@ void test_SetElements()
          {
             for( int k = 0; k < matrixSize; k++ )
             {
-               if( k == elementIdx - gridSize || 
+               if( k == elementIdx - gridSize ||
                    k == elementIdx - 1 ||
                    k == elementIdx + 1 ||
                    k == elementIdx + gridSize )
@@ -403,7 +403,7 @@ void test_SetElement()
    RealType value = 1;
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             m.setElement( i, j, value++ );
          else
          {
@@ -466,7 +466,7 @@ void test_AddElement()
    RealType value = 1;
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
          {
             if( j >= i )
                m.setElement( i, j, value );
@@ -524,7 +524,7 @@ void test_AddElement()
    RealType multiplicator = 2;
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             m.addElement( i, j, value++, multiplicator );
          else
          {
@@ -669,7 +669,7 @@ void test_AddRow()
       for( IndexType j = 0; j < cols; j++ )
       {
          IndexType offset = j - i;
-         if( diagonals.containsValue( offset ) && offset >= 0)
+         if( TNL::Algorithms::contains( diagonals, offset ) && offset >= 0)
             m.setElement( i, j, value );
          value++;
       }
@@ -883,7 +883,7 @@ void test_VectorProduct()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++)
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             m.setElement( i, j, value );
          value++;
       }
@@ -1285,7 +1285,7 @@ void test_AssignmentOperator()
    MultidiagonalHost hostMatrix( rows, columns, diagonalsOffsets );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j <  columns; j++ )
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             hostMatrix.setElement( i, j,  i + j );
 
    Matrix matrix( rows, columns, diagonalsOffsets );
@@ -1293,7 +1293,7 @@ void test_AssignmentOperator()
    matrix = hostMatrix;
    for( IndexType i = 0; i < columns; i++ )
       for( IndexType j = 0; j < rows; j++ )
-            if( diagonalsOffsets.containsValue( j - i ) )
+            if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
                EXPECT_EQ( matrix.getElement( i, j ), i + j );
             else
                EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -1302,7 +1302,7 @@ void test_AssignmentOperator()
    MultidiagonalCuda cudaMatrix( rows, columns, diagonalsOffsets );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             cudaMatrix.setElement( i, j, i + j );
 
    matrix.getValues() = 0.0;
@@ -1310,7 +1310,7 @@ void test_AssignmentOperator()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -1345,7 +1345,7 @@ void test_SaveAndLoad()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
       {
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             savedMatrix.setElement( i, j, value );
          value++;
       }
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index 098a3e0a4..81a7f26c9 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -20,6 +20,7 @@
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
@@ -495,7 +496,7 @@ void multidiagonalMatrixAssignment()
    MultidiagonalHost hostMatrix( rows, columns, diagonals );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             hostMatrix.setElement( i, j, i + j );
 
    Matrix matrix;
@@ -512,7 +513,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -527,7 +528,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
-- 
GitLab


From 71a5b208b5291e89af7f886d48ca697008eb85e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 6 Aug 2021 10:15:03 +0200
Subject: [PATCH 44/52] Moved HasStaticGetSerializationType from TypeTraits.h
 into TypeInfo.h

---
 src/TNL/TypeInfo.h   | 41 ++++++++++++++++++++++++++++++++++-------
 src/TNL/TypeTraits.h | 27 ---------------------------
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/src/TNL/TypeInfo.h b/src/TNL/TypeInfo.h
index 61377fbb8..36940e868 100644
--- a/src/TNL/TypeInfo.h
+++ b/src/TNL/TypeInfo.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <typeinfo>
+#include <type_traits>
 #include <string>
 
 #if defined( __has_include )
@@ -27,11 +28,10 @@
    #include <cstdlib>  // std::free
 #endif
 
-#include <TNL/TypeTraits.h>
 #include <TNL/String.h>
 
 namespace TNL {
-namespace __getType_impl {
+namespace detail {
 
 inline std::string
 demangle( const char* name )
@@ -49,7 +49,34 @@ demangle( const char* name )
    return name;
 }
 
-} // namespace __getType_impl
+/**
+ * \brief Type trait for checking if T has a static getSerializationType method.
+ */
+template< typename T >
+class HasStaticGetSerializationType
+{
+private:
+   template< typename U >
+   static constexpr auto check(U*)
+   -> typename
+      std::enable_if_t<
+         ! std::is_same<
+               decltype( U::getSerializationType() ),
+               void
+            >::value,
+         std::true_type
+      >;
+
+   template< typename >
+   static constexpr std::false_type check(...);
+
+   using type = decltype(check<std::decay_t<T>>(0));
+
+public:
+    static constexpr bool value = type::value;
+};
+
+} // namespace detail
 
 /**
  * \brief Returns a human-readable string representation of given type.
@@ -61,7 +88,7 @@ demangle( const char* name )
 template< typename T >
 String getType()
 {
-   return __getType_impl::demangle( typeid(T).name() );
+   return detail::demangle( typeid(T).name() );
 }
 
 /**
@@ -74,7 +101,7 @@ String getType()
 template< typename T >
 String getType( T&& obj )
 {
-   return __getType_impl::demangle( typeid(obj).name() );
+   return detail::demangle( typeid(obj).name() );
 }
 
 /**
@@ -87,7 +114,7 @@ String getType( T&& obj )
  * serialization type for multiple devices.
  */
 template< typename T,
-          std::enable_if_t< ! HasStaticGetSerializationType< T >::value, bool > = true >
+          std::enable_if_t< ! detail::HasStaticGetSerializationType< T >::value, bool > = true >
 String getSerializationType()
 {
    return getType< T >();
@@ -98,7 +125,7 @@ String getSerializationType()
  *        static \e getSerializationType method to override the default behaviour.
  */
 template< typename T,
-          std::enable_if_t< HasStaticGetSerializationType< T >::value, bool > = true >
+          std::enable_if_t< detail::HasStaticGetSerializationType< T >::value, bool > = true >
 String getSerializationType()
 {
    return T::getSerializationType();
diff --git a/src/TNL/TypeTraits.h b/src/TNL/TypeTraits.h
index c5d0fea36..42ecb9d63 100644
--- a/src/TNL/TypeTraits.h
+++ b/src/TNL/TypeTraits.h
@@ -226,33 +226,6 @@ struct IsViewType
             std::is_same< typename std::decay_t<T>::ViewType, T >::value >
 {};
 
-/**
- * \brief Type trait for checking if T has a static getSerializationType method.
- */
-template< typename T >
-class HasStaticGetSerializationType
-{
-private:
-   template< typename U >
-   static constexpr auto check(U*)
-   -> typename
-      std::enable_if_t<
-         ! std::is_same<
-               decltype( U::getSerializationType() ),
-               void
-            >::value,
-         std::true_type
-      >;
-
-   template< typename >
-   static constexpr std::false_type check(...);
-
-   using type = decltype(check<std::decay_t<T>>(0));
-
-public:
-    static constexpr bool value = type::value;
-};
-
 /**
  * \brief Type trait for checking if T has getCommunicationGroup method.
  */
-- 
GitLab


From d8e38db3f4620b9ab09ecf1808a3680a399e01cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 6 Aug 2021 12:59:05 +0200
Subject: [PATCH 45/52] Simplified type traits in Math.h, removed pre-C++14
 code

---
 src/TNL/Math.h | 33 +++++++--------------------------
 1 file changed, 7 insertions(+), 26 deletions(-)

diff --git a/src/TNL/Math.h b/src/TNL/Math.h
index cb583c03c..220f6ad54 100644
--- a/src/TNL/Math.h
+++ b/src/TNL/Math.h
@@ -27,20 +27,11 @@ namespace TNL {
  */
 template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type,
           // enable_if is necessary to avoid ambiguity in vector expressions
-          std::enable_if_t< ! HasSubscriptOperator<T1>::value && ! HasSubscriptOperator<T2>::value, bool > = true >
-__cuda_callable__
-ResultType min( const T1& a, const T2& b )
+          std::enable_if_t< std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value, bool > = true >
+constexpr ResultType min( const T1& a, const T2& b )
 {
-#if __cplusplus >= 201402L
    // std::min is constexpr since C++14 so it can be reused directly
    return std::min( (ResultType) a, (ResultType) b );
-#else
- #if defined(__CUDA_ARCH__)
-   return ::min( (ResultType) a, (ResultType) b );
- #else
-   return std::min( (ResultType) a, (ResultType) b );
- #endif
-#endif
 }
 
 /**
@@ -49,8 +40,7 @@ ResultType min( const T1& a, const T2& b )
  * The inputs are folded with the \ref min function from the left to the right.
  */
 template< typename T1, typename T2, typename T3, typename... Ts >
-__cuda_callable__
-typename std::common_type< T1, T2, T3, Ts... >::type
+constexpr typename std::common_type< T1, T2, T3, Ts... >::type
 min( T1&& val1, T2&& val2, T3&& val3, Ts&&... vs )
 {
    return min( min( std::forward<T1>(val1), std::forward<T2>(val2) ),
@@ -65,20 +55,11 @@ min( T1&& val1, T2&& val2, T3&& val3, Ts&&... vs )
  */
 template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type,
           // enable_if is necessary to avoid ambiguity in vector expressions
-          std::enable_if_t< ! HasSubscriptOperator<T1>::value && ! HasSubscriptOperator<T2>::value, bool > = true >
-__cuda_callable__
-ResultType max( const T1& a, const T2& b )
+          std::enable_if_t< std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value, bool > = true >
+constexpr ResultType max( const T1& a, const T2& b )
 {
-#if __cplusplus >= 201402L
    // std::max is constexpr since C++14 so it can be reused directly
    return std::max( (ResultType) a, (ResultType) b );
-#else
- #if defined(__CUDA_ARCH__)
-   return ::max( (ResultType) a, (ResultType) b );
- #else
-   return std::max( (ResultType) a, (ResultType) b );
- #endif
-#endif
 }
 
 /**
@@ -99,7 +80,7 @@ max( T1&& val1, T2&& val2, T3&& val3, Ts&&... vs )
  * \brief This function returns absolute value of given number \e n.
  */
 template< class T,
-          std::enable_if_t< ! std::is_unsigned<T>::value && ! std::is_class<T>::value, bool > = true >
+          std::enable_if_t< std::is_arithmetic<T>::value && ! std::is_unsigned<T>::value, bool > = true >
 __cuda_callable__
 T abs( const T& n )
 {
@@ -169,7 +150,7 @@ ResultType argAbsMax( const T1& a, const T2& b )
  */
 template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type,
           // enable_if is necessary to avoid ambiguity in vector expressions
-          std::enable_if_t< ! std::is_class<T1>::value && ! std::is_class<T2>::value, bool > = true >
+          std::enable_if_t< std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value, bool > = true >
 __cuda_callable__
 ResultType pow( const T1& base, const T2& exp )
 {
-- 
GitLab


From 2d454b1535fc237c66ab510c3b10629e4d888f02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 6 Aug 2021 13:19:55 +0200
Subject: [PATCH 46/52] Added IsScalarType trait for the detection of scalar
 types in vector expressions

This is needed because custom specializations of std::is_arithmetic
cannot be used (they cause undefined behaviour).
---
 src/TNL/Containers/Expressions/TypeTraits.h | 10 +---------
 src/TNL/TypeTraits.h                        | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/TNL/Containers/Expressions/TypeTraits.h b/src/TNL/Containers/Expressions/TypeTraits.h
index 3142ee35e..943d61470 100644
--- a/src/TNL/Containers/Expressions/TypeTraits.h
+++ b/src/TNL/Containers/Expressions/TypeTraits.h
@@ -107,20 +107,12 @@ using RemoveET = typename RemoveExpressionTemplate< R >::type;
 
 template< typename T1, typename T2 >
 constexpr std::enable_if_t<
-      ! ( std::is_arithmetic< T1 >::value && std::is_arithmetic< T2 >::value ) &&
       ! ( IsStaticArrayType< T1 >::value && IsStaticArrayType< T2 >::value ) &&
       ! ( IsArrayType< T1 >::value && IsArrayType< T2 >::value )
 , bool >
 compatibleForVectorAssignment()
 {
-   return false;
-}
-
-template< typename T1, typename T2 >
-constexpr std::enable_if_t< std::is_arithmetic< T1 >::value && std::is_arithmetic< T2 >::value, bool >
-compatibleForVectorAssignment()
-{
-   return true;
+   return IsScalarType< T1 >::value && IsScalarType< T2 >::value;
 }
 
 template< typename T1, typename T2 >
diff --git a/src/TNL/TypeTraits.h b/src/TNL/TypeTraits.h
index 42ecb9d63..3a199e1b2 100644
--- a/src/TNL/TypeTraits.h
+++ b/src/TNL/TypeTraits.h
@@ -130,6 +130,20 @@ public:
     static constexpr bool value = type::value;
 };
 
+/**
+ * \brief Type trait for checking if T is a [scalar type](https://en.wikipedia.org/wiki/Scalar_(mathematics))
+ * (in the mathematical sense). Not to be confused with \ref std::is_scalar.
+ *
+ * For example, \ref std::is_arithmetic "arithmetic types" as defined by the STL
+ * are scalar types. TNL also provides additional scalar types, e.g. for
+ * extended-precision arithmetic. Users may also define specializations of this
+ * trait class for their custom scalar types.
+ */
+template< typename T >
+struct IsScalarType
+: public std::is_arithmetic< T >
+{};
+
 /**
  * \brief Type trait for checking if T is an array type, e.g.
  *        \ref Containers::Array or \ref Containers::Vector.
-- 
GitLab


From 1433c746ce0daa9e5771f23d7d3bda6cf0f7b42e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Mon, 2 Aug 2021 22:57:41 +0200
Subject: [PATCH 47/52] Added tests of the reduction and scan algorithm with
 CustomScalar

This way we test both the general CUDA implementation using shared
memory and the specialization using __shfl instructions.

Both the reduction and scan kernels needed some tweaks due to shared
memory usage with non-fundamental types.
---
 .../Algorithms/detail/CudaReductionKernel.h   |  22 +-
 src/TNL/Algorithms/detail/CudaScanKernel.h    |  39 ++-
 src/UnitTests/Algorithms/CMakeLists.txt       |   3 +-
 src/UnitTests/Algorithms/reduceTest.h         | 320 ++++++++++--------
 .../{reduceTest.cu => reduceTestCuda.cu}      |   0
 src/UnitTests/Algorithms/scanTest.h           |  10 +
 src/UnitTests/CMakeLists.txt                  |   1 +
 src/UnitTests/CustomScalar.h                  | 267 +++++++++++++++
 src/UnitTests/CustomScalarTest.cpp            |  43 +++
 9 files changed, 551 insertions(+), 154 deletions(-)
 rename src/UnitTests/Algorithms/{reduceTest.cu => reduceTestCuda.cu} (100%)
 create mode 100644 src/UnitTests/CustomScalar.h
 create mode 100644 src/UnitTests/CustomScalarTest.cpp

diff --git a/src/TNL/Algorithms/detail/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
index 126884eaf..c08c686eb 100644
--- a/src/TNL/Algorithms/detail/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -388,7 +388,14 @@ CudaReductionKernel( DataFetcher dataFetcher,
 
    // allocate shared memory
    using BlockReduce = CudaBlockReduce< blockSize, Reduction, Result >;
-   __shared__ typename BlockReduce::Storage storage;
+   union Shared {
+      typename BlockReduce::Storage blockReduceStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
 
    // Calculate the grid size (stride of the sequential reduction loop).
    const Index gridSize = blockDim.x * gridDim.x;
@@ -416,7 +423,7 @@ CudaReductionKernel( DataFetcher dataFetcher,
    __syncthreads();
 
    // Perform the parallel reduction.
-   result = BlockReduce::reduce( reduction, identity, result, threadIdx.x, storage );
+   result = BlockReduce::reduce( reduction, identity, result, threadIdx.x, storage.blockReduceStorage );
 
    // Store the result back in the global memory.
    if( threadIdx.x == 0 )
@@ -443,7 +450,14 @@ CudaReductionWithArgumentKernel( DataFetcher dataFetcher,
 
    // allocate shared memory
    using BlockReduce = CudaBlockReduceWithArgument< blockSize, Reduction, Result, Index >;
-   __shared__ typename BlockReduce::Storage storage;
+   union Shared {
+      typename BlockReduce::Storage blockReduceStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
 
    // Calculate the grid size (stride of the sequential reduction loop).
    const Index gridSize = blockDim.x * gridDim.x;
@@ -504,7 +518,7 @@ CudaReductionWithArgumentKernel( DataFetcher dataFetcher,
    __syncthreads();
 
    // Perform the parallel reduction.
-   const std::pair< Result, Index > result_pair = BlockReduce::reduceWithArgument( reduction, identity, result, initialIndex, threadIdx.x, storage );
+   const std::pair< Result, Index > result_pair = BlockReduce::reduceWithArgument( reduction, identity, result, initialIndex, threadIdx.x, storage.blockReduceStorage );
 
    // Store the result back in the global memory.
    if( threadIdx.x == 0 ) {
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 0c170450c..be1c5a380 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -430,6 +430,10 @@ CudaScanKernelUpsweep( const InputView input,
    union Shared {
       ValueType data[ blockSize * valuesPerThread ];
       typename BlockReduce::Storage blockReduceStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
    };
    __shared__ Shared storage;
 
@@ -501,13 +505,20 @@ CudaScanKernelDownsweep( const InputView input,
    using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;
 
    // allocate shared memory
-   __shared__ typename TileScan::Storage storage;
+   union Shared {
+      typename TileScan::Storage tileScanStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
 
    // load the reduction of the previous tiles
    shift = reduction( shift, reductionResults[ blockIdx.x ] );
 
    // scan from input into output
-   TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, shift, storage );
+   TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, shift, storage.tileScanStorage );
 }
 
 /* CudaScanKernelParallel - scan each tile of the input separately in each CUDA
@@ -534,10 +545,17 @@ CudaScanKernelParallel( const InputView input,
    using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;
 
    // allocate shared memory
-   __shared__ typename TileScan::Storage storage;
+   union Shared {
+      typename TileScan::Storage tileScanStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
 
    // scan from input into output
-   const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, identity, storage );
+   const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, identity, storage.tileScanStorage );
 
    // The last thread of the block stores the block result in the global memory.
    if( blockResults && threadIdx.x == blockDim.x - 1 )
@@ -565,9 +583,16 @@ CudaScanKernelUniformShift( OutputView output,
                             typename OutputView::ValueType shift )
 {
    // load the block result into a __shared__ variable first
-   __shared__ typename OutputView::ValueType blockResult;
+   union Shared {
+      typename OutputView::ValueType blockResult;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
    if( threadIdx.x == 0 )
-      blockResult = blockResults[ blockIdx.x ];
+      storage.blockResult = blockResults[ blockIdx.x ];
 
    // update the output offset for the thread
    TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelUniformShift" );
@@ -577,7 +602,7 @@ CudaScanKernelUniformShift( OutputView output,
 
    // update the block shift
    __syncthreads();
-   shift = reduction( shift, blockResult );
+   shift = reduction( shift, storage.blockResult );
 
    int valueIdx = 0;
    while( valueIdx < valuesPerThread && outputBegin < outputEnd )
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 0738a6f8a..aa14ae462 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -6,16 +6,17 @@ set( COMMON_TESTS
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
-         reduceTest
          staticForTest
          unrolledForTest
 )
 
 set( CPP_TESTS
+         reduceTest
          scanTest
          SegmentedScanTest
 )
 set( CUDA_TESTS
+         reduceTestCuda
          scanTestCuda
 )
 if( BUILD_CUDA )
diff --git a/src/UnitTests/Algorithms/reduceTest.h b/src/UnitTests/Algorithms/reduceTest.h
index 6b1565d71..c39215f47 100644
--- a/src/UnitTests/Algorithms/reduceTest.h
+++ b/src/UnitTests/Algorithms/reduceTest.h
@@ -1,244 +1,280 @@
-/***************************************************************************
-                          reduceTest.h  -  description
-                             -------------------
-    begin                : Jul 2, 2021
-    copyright            : (C) 2021 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
 #pragma once
 
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include <TNL/Arithmetics/Quad.h>
 #include <TNL/Containers/Array.h>
 #include <TNL/Algorithms/reduce.h>
+#include "../CustomScalar.h"
 
-#ifdef HAVE_GTEST
-#include <gtest/gtest.h>
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Arithmetics;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::detail;
+
+// test fixture for typed tests
+template< typename Array >
+class ReduceTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;
+};
+
+// types for which ReduceTest is instantiated
+// TODO: Quad must be fixed
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Sequential, int >,
+   Array< int,            Devices::Sequential, int >,
+   Array< long,           Devices::Sequential, int >,
+   Array< double,         Devices::Sequential, int >,
+   //Array< Quad< float >,  Devices::Sequential, int >,
+   //Array< Quad< double >, Devices::Sequential, int >,
+   Array< CustomScalar< int >, Devices::Sequential, long >,
+   Array< int,            Devices::Sequential, long >,
+   Array< long,           Devices::Sequential, long >,
+   Array< double,         Devices::Sequential, long >,
+   //Array< Quad< float >,  Devices::Sequential, long >,
+   //Array< Quad< double >, Devices::Sequential, long >,
+
+   Array< CustomScalar< int >, Devices::Host, int >,
+   Array< int,            Devices::Host, int >,
+   Array< long,           Devices::Host, int >,
+   Array< double,         Devices::Host, int >,
+   //Array< Quad< float >,  Devices::Host, int >,
+   //Array< Quad< double >, Devices::Host, int >,
+   Array< CustomScalar< int >, Devices::Host, long >,
+   Array< int,            Devices::Host, long >,
+   Array< long,           Devices::Host, long >,
+   Array< double,         Devices::Host, long >
+   //Array< Quad< float >,  Devices::Host, long >,
+   //Array< Quad< double >, Devices::Host, long >
+#endif
+#ifdef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Cuda, int >,  // the reduction kernel for CustomScalar is not specialized with __shfl instructions
+   Array< int,            Devices::Cuda, int >,
+   Array< long,           Devices::Cuda, int >,
+   Array< double,         Devices::Cuda, int >,
+   //Array< Quad< float >,  Devices::Cuda, int >,
+   //Array< Quad< double >, Devices::Cuda, int >,
+   Array< CustomScalar< int >, Devices::Cuda, long >,  // the reduction kernel for CustomScalar is not specialized with __shfl instructions
+   Array< int,            Devices::Cuda, long >,
+   Array< long,           Devices::Cuda, long >,
+   Array< double,         Devices::Cuda, long >
+   //Array< Quad< float >,  Devices::Cuda, long >,
+   //Array< Quad< double >, Devices::Cuda, long >
 #endif
+>;
 
-using namespace TNL;
+TYPED_TEST_SUITE( ReduceTest, ArrayTypes );
 
-#ifdef HAVE_GTEST
+template< typename Array >
+void iota( Array& array, typename Array::ValueType start = 0 )
+{
+   array.forAllElements( [start] __cuda_callable__
+                         ( typename Array::IndexType idx, typename Array::ValueType& value )
+                         { value = idx + start; }
+                       );
+}
+
+template< typename Array >
+void mod( Array& array, typename Array::IndexType mod = 0 )
+{
+   array.forAllElements( [mod] __cuda_callable__
+                         ( typename Array::IndexType idx, typename Array::ValueType& value )
+                         { value = idx % mod; }
+                       );
+}
 
-template< typename Device >
-void ReduceTest_sum()
+TYPED_TEST( ReduceTest, sum )
 {
-   using Array = Containers::Array< int, Device >;
-   Array a;
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
       a.setValue( 1 );
-      auto a_view = a.getView();
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Plus{} );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Plus{} );
       EXPECT_EQ( res, size );
+
+      res = reduce( a, TNL::Plus{} );
+      EXPECT_EQ( res, size );
+   }
+
+   const int size = 9377;
+   a.setSize( size );
+   iota( a );
+   auto res = reduce( a, TNL::Plus{} );
+   EXPECT_EQ( res, (size * (size - 1)) / 2 );
+}
+
+TYPED_TEST( ReduceTest, product )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   a.setSize( 10 );
+   a.setValue( 2 );
+
+   int result = 1;
+   for( int size = 0; size < a.getSize(); size++ )
+   {
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Multiplies{} );
+      EXPECT_EQ( res, result );
+      result *= 2;
    }
 }
 
-template< typename Device >
-void ReduceTest_min()
+TYPED_TEST( ReduceTest, min )
 {
-   using Array = Containers::Array< int, Device >;
-   Array a;
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
+      iota( a, 1 );
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Min{} );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Min{} );
       EXPECT_EQ( res, 1 );
    }
 }
 
-template< typename Device >
-void ReduceTest_max()
+TYPED_TEST( ReduceTest, max )
 {
-   using Array = Containers::Array< int, Device >;
-   Array a;
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
+      iota( a, 1 );
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Max{} );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Max{} );
       EXPECT_EQ( res, size );
    }
 }
 
-template< typename Device >
-void ReduceTest_minWithArg()
+TYPED_TEST( ReduceTest, minWithArg )
 {
-   using Array = Containers::Array< int, Device >;
-   Array a;
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
+      iota( a, 1 );
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MinWithArg{} );
+      auto res = reduceWithArgument< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::MinWithArg{} );
       EXPECT_EQ( res.first, 1 );
       EXPECT_EQ( res.second, 0 );
    }
 }
 
-template< typename Device >
-void ReduceTest_maxWithArg()
+TYPED_TEST( ReduceTest, maxWithArg )
 {
-   using Array = Containers::Array< int, Device >;
-   Array a;
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
+      iota( a, 1 );
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MaxWithArg{} );
+      auto res = reduceWithArgument< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::MaxWithArg{} );
       EXPECT_EQ( res.first, size );
       EXPECT_EQ( res.second, size - 1 );
    }
 }
 
-template< typename Device >
-void ReduceTest_logicalAnd()
+TYPED_TEST( ReduceTest, logicalAnd )
 {
-   using Array = Containers::Array< bool, Device >;
-   Array a;
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, bool& value ) { value = ( bool ) ( idx % 2 ); } );
-      auto a_view = a.getView();
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalAnd{} );
+      mod( a, 2 );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalAnd{} );
       EXPECT_EQ( res, false );
-   }
-}
 
-template< typename Device >
-void ReduceTest_logicalOr()
-{
-   using Array = Containers::Array< bool, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, bool& value ) { value = ( bool ) ( idx % 2 ); } );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalOr{} );
+      a.setValue( 1 );
+      res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalAnd{} );
       EXPECT_EQ( res, true );
    }
 }
 
-template< typename Device >
-void ReduceTest_bitAnd()
+TYPED_TEST( ReduceTest, logicalOr )
 {
-   using Array = Containers::Array< char, Device >;
-   Array a;
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, char& value ) { value = 1 | ( 1 << ( idx % 8 ) ); } );
-      auto a_view = a.getView();
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitAnd{} );
-      EXPECT_EQ( res, 1 );
+      mod( a, 2 );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalOr{} );
+      EXPECT_EQ( res, true );
+
+      a.setValue( 0 );
+      res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalOr{} );
+      EXPECT_EQ( res, false );
    }
 }
 
-template< typename Device >
-void ReduceTest_bitOr()
+// bitwise AND (&) is not defined for floating-point types
+template< typename ArrayType >
+std::enable_if_t< std::is_integral< typename ArrayType::ValueType >::value >
+test_bitAnd( ArrayType& a )
 {
-   using Array = Containers::Array< char, Device >;
-   Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, char& value ) { value = 1 << ( idx % 8 );} );
-      auto a_view = a.getView();
+      a.forAllElements( [] __cuda_callable__ ( typename ArrayType::IndexType idx, typename ArrayType::ValueType& value ) { value = 1 | ( 1 << ( idx % 8 ) ); } );
 
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitOr{} );
-      EXPECT_EQ( res, ( char ) 255 );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::BitAnd{} );
+      EXPECT_EQ( res, 1 );
    }
 }
 
-// test fixture for typed tests
-template< typename Device >
-class ReduceTest : public ::testing::Test
-{
-protected:
-   using DeviceType = Device;
-};
-
-// types for which ArrayTest is instantiated
-using DeviceTypes = ::testing::Types<
-   Devices::Host
-#ifdef HAVE_CUDA
-   ,Devices::Cuda
-#endif
-   >;
-
-TYPED_TEST_SUITE( ReduceTest, DeviceTypes );
-
-TYPED_TEST( ReduceTest, sum )
+template< typename ArrayType >
+std::enable_if_t< ! std::is_integral< typename ArrayType::ValueType >::value >
+test_bitAnd( ArrayType& a )
 {
-   ReduceTest_sum< typename TestFixture::DeviceType >();
 }
 
-TYPED_TEST( ReduceTest, min )
-{
-   ReduceTest_min< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, max )
-{
-   ReduceTest_max< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, minWithArg )
-{
-   ReduceTest_minWithArg< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, maxWithArg )
+TYPED_TEST( ReduceTest, bitAnd )
 {
-   ReduceTest_maxWithArg< typename TestFixture::DeviceType >();
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   test_bitAnd( a );
 }
 
-TYPED_TEST( ReduceTest, logicalAnd )
+// bitwise OR (|) is not defined for floating-point types
+template< typename ArrayType >
+std::enable_if_t< std::is_integral< typename ArrayType::ValueType >::value >
+test_bitOr( ArrayType& a )
 {
-   ReduceTest_logicalAnd< typename TestFixture::DeviceType >();
-}
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( typename ArrayType::IndexType idx, typename ArrayType::ValueType& value ) { value = 1 << ( idx % 8 );} );
 
-TYPED_TEST( ReduceTest, logicalOr )
-{
-   ReduceTest_logicalOr< typename TestFixture::DeviceType >();
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::BitOr{} );
+      EXPECT_EQ( res, 255 );
+   }
 }
 
-TYPED_TEST( ReduceTest, bitAnd )
+template< typename ArrayType >
+std::enable_if_t< ! std::is_integral< typename ArrayType::ValueType >::value >
+test_bitOr( ArrayType& a )
 {
-   ReduceTest_bitAnd< typename TestFixture::DeviceType >();
 }
 
 TYPED_TEST( ReduceTest, bitOr )
 {
-   ReduceTest_bitOr< typename TestFixture::DeviceType >();
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   test_bitOr( a );
 }
 
 #endif
diff --git a/src/UnitTests/Algorithms/reduceTest.cu b/src/UnitTests/Algorithms/reduceTestCuda.cu
similarity index 100%
rename from src/UnitTests/Algorithms/reduceTest.cu
rename to src/UnitTests/Algorithms/reduceTestCuda.cu
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
index d59a7db99..9611d7acd 100644
--- a/src/UnitTests/Algorithms/scanTest.h
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -7,6 +7,7 @@
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/scan.h>
+#include "../CustomScalar.h"
 
 using namespace TNL;
 using namespace TNL::Containers;
@@ -95,32 +96,38 @@ protected:
 // TODO: Quad must be fixed
 using ArrayTypes = ::testing::Types<
 #ifndef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Sequential, short >,
    Array< int,            Devices::Sequential, short >,
    Array< long,           Devices::Sequential, short >,
    Array< double,         Devices::Sequential, short >,
    //Array< Quad< float >,  Devices::Sequential, short >,
    //Array< Quad< double >, Devices::Sequential, short >,
+   Array< CustomScalar< int >, Devices::Sequential, int >,
    Array< int,            Devices::Sequential, int >,
    Array< long,           Devices::Sequential, int >,
    Array< double,         Devices::Sequential, int >,
    //Array< Quad< float >,  Devices::Sequential, int >,
    //Array< Quad< double >, Devices::Sequential, int >,
+   Array< CustomScalar< int >, Devices::Sequential, long >,
    Array< int,            Devices::Sequential, long >,
    Array< long,           Devices::Sequential, long >,
    Array< double,         Devices::Sequential, long >,
    //Array< Quad< float >,  Devices::Sequential, long >,
    //Array< Quad< double >, Devices::Sequential, long >,
 
+   Array< CustomScalar< int >, Devices::Host, short >,
    Array< int,            Devices::Host, short >,
    Array< long,           Devices::Host, short >,
    Array< double,         Devices::Host, short >,
    //Array< Quad< float >,  Devices::Host, short >,
    //Array< Quad< double >, Devices::Host, short >,
+   Array< CustomScalar< int >, Devices::Host, int >,
    Array< int,            Devices::Host, int >,
    Array< long,           Devices::Host, int >,
    Array< double,         Devices::Host, int >,
    //Array< Quad< float >,  Devices::Host, int >,
    //Array< Quad< double >, Devices::Host, int >,
+   Array< CustomScalar< int >, Devices::Host, long >,
    Array< int,            Devices::Host, long >,
    Array< long,           Devices::Host, long >,
    Array< double,         Devices::Host, long >
@@ -128,16 +135,19 @@ using ArrayTypes = ::testing::Types<
    //Array< Quad< double >, Devices::Host, long >
 #endif
 #ifdef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Cuda, short >,  // the scan kernel for CustomScalar is not specialized with __shfl instructions
    Array< int,            Devices::Cuda, short >,
    Array< long,           Devices::Cuda, short >,
    Array< double,         Devices::Cuda, short >,
    //Array< Quad< float >,  Devices::Cuda, short >,
    //Array< Quad< double >, Devices::Cuda, short >,
+   Array< CustomScalar< int >, Devices::Cuda, int >,  // the scan kernel for CustomScalar is not specialized with __shfl instructions
    Array< int,            Devices::Cuda, int >,
    Array< long,           Devices::Cuda, int >,
    Array< double,         Devices::Cuda, int >,
    //Array< Quad< float >,  Devices::Cuda, int >,
    //Array< Quad< double >, Devices::Cuda, int >,
+   Array< CustomScalar< int >, Devices::Cuda, long >,  // the scan kernel for CustomScalar is not specialized with __shfl instructions
    Array< int,            Devices::Cuda, long >,
    Array< long,           Devices::Cuda, long >,
    Array< double,         Devices::Cuda, long >
diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt
index d50b682ba..03c521cec 100644
--- a/src/UnitTests/CMakeLists.txt
+++ b/src/UnitTests/CMakeLists.txt
@@ -8,6 +8,7 @@ ADD_SUBDIRECTORY( Pointers )
 
 set( CPP_TESTS  AssertTest
                 base64Test
+                CustomScalarTest
                 FileNameTest
                 MathTest
                 ObjectTest
diff --git a/src/UnitTests/CustomScalar.h b/src/UnitTests/CustomScalar.h
new file mode 100644
index 000000000..5e957750a
--- /dev/null
+++ b/src/UnitTests/CustomScalar.h
@@ -0,0 +1,267 @@
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include <TNL/Math.h>
+#include <TNL/MPI/getDataType.h>
+
+namespace TNL {
+
+template< class T >
+struct CustomScalar  // wraps a single value of type T and forwards the built-in operators to it (scalar type for unit tests)
+{
+   T value = 0;
+
+public:
+   constexpr CustomScalar() = default;
+
+   constexpr CustomScalar( T value ) : value( value ) {}  // implicit conversion from the wrapped type
+
+   template< typename S >
+   constexpr CustomScalar( const CustomScalar< S >& v ) : value( v.value ) {}
+
+   constexpr CustomScalar( const CustomScalar& ) = default;
+
+   constexpr CustomScalar( CustomScalar&& ) = default;
+
+   constexpr CustomScalar& operator=( const CustomScalar& v ) = default;
+
+   constexpr CustomScalar& operator=( CustomScalar&& v ) = default;
+
+#define MAKE_ASSIGNMENT_OP(op) \
+   template< typename S >                                            \
+   constexpr CustomScalar& operator op( const CustomScalar< S >& v ) \
+   {                                                                 \
+      value op v.value;                                              \
+      return *this;                                                  \
+   }                                                                 \
+   template< typename S >                                            \
+   constexpr CustomScalar& operator op( const S& v )                 \
+   {                                                                 \
+      value op v;                                                    \
+      return *this;                                                  \
+   }                                                                 \
+
+   MAKE_ASSIGNMENT_OP(+=)
+   MAKE_ASSIGNMENT_OP(-=)
+   MAKE_ASSIGNMENT_OP(*=)
+   MAKE_ASSIGNMENT_OP(/=)
+   MAKE_ASSIGNMENT_OP(%=)
+   MAKE_ASSIGNMENT_OP(&=)
+   MAKE_ASSIGNMENT_OP(|=)
+   MAKE_ASSIGNMENT_OP(^=)
+   MAKE_ASSIGNMENT_OP(<<=)
+   MAKE_ASSIGNMENT_OP(>>=)
+
+#undef MAKE_ASSIGNMENT_OP
+
+   // bitwise negation (returns the wrapped result; a bool return type would collapse ~value to true/false)
+   constexpr CustomScalar operator~() const
+   {
+      return ~ value;
+   }
+
+   // logical negation
+   constexpr bool operator!() const
+   {
+      return ! value;
+   }
+
+   // unary plus (integer promotion)
+   constexpr auto operator+() const -> CustomScalar< decltype(+value) >
+   {
+      return +value;
+   }
+
+   // unary minus (additive inverse)
+   constexpr CustomScalar operator-() const
+   {
+      return -value;
+   }
+
+   // prefix increment
+   constexpr CustomScalar& operator++()
+   {
+      ++value;
+      return *this;
+   }
+
+   // prefix decrement
+   constexpr CustomScalar& operator--()
+   {
+      --value;
+      return *this;
+   }
+
+   // postfix increment
+   constexpr CustomScalar operator++(int)
+   {
+      CustomScalar result = *this;
+      value++;
+      return result;
+   }
+
+   // postfix decrement
+   constexpr CustomScalar operator--(int)
+   {
+      CustomScalar result = *this;
+      value--;
+      return result;
+   }
+
+   // cast to T
+   constexpr operator T() const
+   {
+      return value;
+   }
+};
+// Binary arithmetic, bitwise and shift operators for (CustomScalar, CustomScalar), (CustomScalar, S) and (S, CustomScalar); the result wraps the type of the built-in operation.
+#define MAKE_BINARY_OP(op)                                              \
+template< class T, class S >                                            \
+constexpr auto operator op( const CustomScalar< T >& v1,                \
+                            const CustomScalar< S >& v2 )               \
+   -> CustomScalar< decltype( v1.value op v2.value ) >                  \
+{                                                                       \
+   return v1.value op v2.value;                                         \
+}                                                                       \
+template< class T, class S >                                            \
+constexpr auto operator op( const CustomScalar< T >& v1, const S& v2 )  \
+   -> CustomScalar< decltype( v1.value op v2 ) >                        \
+{                                                                       \
+   return v1.value op v2;                                               \
+}                                                                       \
+template< class S, class T >                                            \
+constexpr auto operator op( const S& v1, const CustomScalar< T >& v2 )  \
+   -> CustomScalar< decltype( v1 op v2.value ) >                        \
+{                                                                       \
+   return v1 op v2.value;                                               \
+}                                                                       \
+
+MAKE_BINARY_OP(+)
+MAKE_BINARY_OP(-)
+MAKE_BINARY_OP(*)
+MAKE_BINARY_OP(/)
+MAKE_BINARY_OP(%)
+MAKE_BINARY_OP(&)
+MAKE_BINARY_OP(|)
+MAKE_BINARY_OP(^)
+MAKE_BINARY_OP(<<)
+MAKE_BINARY_OP(>>)
+
+#undef MAKE_BINARY_OP
+// Comparison and logical operators returning plain bool; note that the overloaded && and || evaluate both operands (no short-circuit).
+#define MAKE_BOOL_BINARY_OP(op)                                         \
+template< class T, class S >                                            \
+constexpr bool operator op( const CustomScalar< T >& v1,                \
+                            const CustomScalar< S >& v2 )               \
+{                                                                       \
+   return v1.value op v2.value;                                         \
+}                                                                       \
+template< class T, class S >                                            \
+constexpr bool operator op( const CustomScalar< T >& v1, const S& v2 )  \
+{                                                                       \
+   return v1.value op v2;                                               \
+}                                                                       \
+template< class S, class T >                                            \
+constexpr bool operator op( const S& v1, const CustomScalar< T >& v2 )  \
+{                                                                       \
+   return v1 op v2.value;                                               \
+}                                                                       \
+
+MAKE_BOOL_BINARY_OP(==)
+MAKE_BOOL_BINARY_OP(!=)
+MAKE_BOOL_BINARY_OP(<=)
+MAKE_BOOL_BINARY_OP(>=)
+MAKE_BOOL_BINARY_OP(<)
+MAKE_BOOL_BINARY_OP(>)
+MAKE_BOOL_BINARY_OP(&&)
+MAKE_BOOL_BINARY_OP(||)
+
+#undef MAKE_BOOL_BINARY_OP
+
+template< class T >
+std::istream& operator>>( std::istream& str, CustomScalar< T >& v )  // non-const reference: extraction stores into v.value
+{
+   return str >> v.value;
+}
+// Stream insertion: writes only the wrapped value and returns the stream to allow chaining.
+template< class T >
+std::ostream& operator<<( std::ostream& str, const CustomScalar< T >& v )
+{
+   return str << v.value;
+}
+// Math function overloads for CustomScalar: each forwards to TNL::fname on the wrapped value(s) and wraps the returned type in CustomScalar.
+#define MAKE_UNARY_FUNC(fname)                              \
+   template< class T >                                      \
+   constexpr auto fname ( const CustomScalar< T >& v )      \
+      -> CustomScalar< decltype(TNL::fname( v.value )) >    \
+   { return TNL::fname( v.value ); }                        \
+
+#define MAKE_BINARY_FUNC(fname)                                                     \
+   template< class T, class S >                                                     \
+   constexpr auto fname ( const CustomScalar< T >& v, const CustomScalar< S >& w )  \
+      -> CustomScalar< decltype(TNL::fname( v.value, w.value )) >                   \
+   { return TNL::fname( v.value, w.value ); }                                       \
+   template< class T, class S >                                                     \
+   constexpr auto fname ( const CustomScalar< T >& v, const S& w )                  \
+      -> CustomScalar< decltype(TNL::fname( v.value, w )) >                         \
+   { return TNL::fname( v.value, w ); }                                             \
+   template< class S, class T >                                                     \
+   constexpr auto fname ( const S& w, const CustomScalar< T >& v )                  \
+      -> CustomScalar< decltype(TNL::fname( w, v.value )) >                         \
+   { return TNL::fname( w, v.value ); }                                             \
+
+MAKE_UNARY_FUNC( abs )
+MAKE_UNARY_FUNC( sqrt )
+MAKE_UNARY_FUNC( cbrt )
+MAKE_UNARY_FUNC( exp )
+MAKE_UNARY_FUNC( log )
+MAKE_UNARY_FUNC( log10 )
+MAKE_UNARY_FUNC( log2 )
+MAKE_UNARY_FUNC( sin )
+MAKE_UNARY_FUNC( cos )
+MAKE_UNARY_FUNC( tan )
+MAKE_UNARY_FUNC( asin )
+MAKE_UNARY_FUNC( acos )
+MAKE_UNARY_FUNC( atan )
+MAKE_UNARY_FUNC( sinh )
+MAKE_UNARY_FUNC( cosh )
+MAKE_UNARY_FUNC( tanh )
+MAKE_UNARY_FUNC( asinh )
+MAKE_UNARY_FUNC( acosh )
+MAKE_UNARY_FUNC( atanh )
+MAKE_UNARY_FUNC( floor )
+MAKE_UNARY_FUNC( ceil )
+
+MAKE_BINARY_FUNC( min )
+MAKE_BINARY_FUNC( max )
+MAKE_BINARY_FUNC( argMin )
+MAKE_BINARY_FUNC( argMax )
+MAKE_BINARY_FUNC( argAbsMin )
+MAKE_BINARY_FUNC( argAbsMax )
+MAKE_BINARY_FUNC( pow )
+
+#undef MAKE_UNARY_FUNC
+#undef MAKE_BINARY_FUNC
+
+} // namespace TNL
+// numeric_limits of CustomScalar< T > reuses the limits of T (its members therefore return T, not CustomScalar< T >)
+namespace std {
+   template< typename T >
+   struct numeric_limits< TNL::CustomScalar< T > > : public numeric_limits< T > {};
+} // namespace std
+// registers CustomScalar< T > as a scalar type in the TNL::IsScalarType trait
+namespace TNL {
+   template< typename T >
+   struct IsScalarType< CustomScalar< T > > : public std::true_type {};
+} // namespace TNL
+// when built with MPI, the TypeResolver for CustomScalar< T > inherits everything from TypeResolver< T >
+#ifdef HAVE_MPI
+namespace TNL {
+namespace MPI {
+   template< typename T >
+   struct TypeResolver< CustomScalar< T > > : public TypeResolver< T > {};
+} // namespace MPI
+} // namespace TNL
+#endif
diff --git a/src/UnitTests/CustomScalarTest.cpp b/src/UnitTests/CustomScalarTest.cpp
new file mode 100644
index 000000000..c650e050f
--- /dev/null
+++ b/src/UnitTests/CustomScalarTest.cpp
@@ -0,0 +1,43 @@
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include "CustomScalar.h"
+
+using scalar = TNL::CustomScalar< int >;
+// scalar wraps int: the first half compares against int literals, the second half against double literals (b is built from 1.0 converted to int)
+TEST( CustomScalarTest, comparison )
+{
+   scalar a = 1;
+   EXPECT_EQ( a, 1 );
+   EXPECT_EQ( 1, a );
+   EXPECT_NE( a, 2 );
+   EXPECT_NE( 2, a );
+   EXPECT_LE( a, 1 );
+   EXPECT_LE( 1, a );
+   EXPECT_GE( a, 1 );
+   EXPECT_GE( 1, a );
+   EXPECT_LT( a, 2 );
+   EXPECT_LT( 0, a );
+   EXPECT_GT( a, 0 );
+   EXPECT_GT( 2, a );
+
+   scalar b = 1.0;
+   EXPECT_EQ( b, 1.0 );
+   EXPECT_EQ( 1.0, b );
+   EXPECT_NE( b, 2.0 );
+   EXPECT_NE( 2.0, b );
+   EXPECT_LE( b, 1.0 );
+   EXPECT_LE( 1.0, b );
+   EXPECT_GE( b, 1.0 );
+   EXPECT_GE( 1.0, b );
+   EXPECT_LT( b, 2.0 );
+   EXPECT_LT( 0.0, b );
+   EXPECT_GT( b, 0.0 );
+   EXPECT_GT( 2.0, b );
+}
+
+// TODO: test the other operators
+
+#endif
+
+#include "main.h"
-- 
GitLab


From 4f2c112a923166a2d2eb043c0128f55b3b12ec3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Thu, 5 Aug 2021 10:28:02 +0200
Subject: [PATCH 48/52] Added tests of expression templates with CustomScalar

---
 .../DistributedExpressionTemplates.h          | 12 +++-
 .../Expressions/ExpressionTemplates.h         | 12 +++-
 .../Expressions/HorizontalOperations.h        | 12 +++-
 .../Expressions/StaticExpressionTemplates.h   | 13 +++-
 .../Containers/VectorBinaryOperationsTest.h   | 68 +++++++++----------
 .../Containers/VectorHelperFunctions.h        | 15 ++--
 .../Containers/VectorUnaryOperationsTest.h    | 38 +++++++----
 .../Containers/VectorVerticalOperationsTest.h | 21 ++++--
 8 files changed, 120 insertions(+), 71 deletions(-)

diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index 093547d0b..bb1c0fa8b 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -516,6 +516,16 @@ dot( const ET1& a, const ET2& b )
    return (a, b);
 }
 
+////
+// Unary expression plus
+template< typename ET1,
+          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+auto
+operator+( const ET1& a )
+{
+   return DistributedUnaryExpressionTemplate< ET1, UnaryPlus >( a );
+}
+
 ////
 // Unary expression minus
 template< typename ET1,
@@ -523,7 +533,7 @@ template< typename ET1,
 auto
 operator-( const ET1& a )
 {
-   return DistributedUnaryExpressionTemplate< ET1, Minus >( a );
+   return DistributedUnaryExpressionTemplate< ET1, UnaryMinus >( a );
 }
 
 ////
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 3e01255fc..05b22fab1 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -381,6 +381,16 @@ dot( const ET1& a, const ET2& b )
    return (a, b);
 }
 
+////
+// Unary expression plus
+template< typename ET1,
+          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+auto
+operator+( const ET1& a )
+{
+   return UnaryExpressionTemplate< ET1, UnaryPlus >( a );
+}
+
 ////
 // Unary expression minus
 template< typename ET1,
@@ -388,7 +398,7 @@ template< typename ET1,
 auto
 operator-( const ET1& a )
 {
-   return UnaryExpressionTemplate< ET1, Minus >( a );
+   return UnaryExpressionTemplate< ET1, UnaryMinus >( a );
 }
 
 ////
diff --git a/src/TNL/Containers/Expressions/HorizontalOperations.h b/src/TNL/Containers/Expressions/HorizontalOperations.h
index 614f2c878..8000243d8 100644
--- a/src/TNL/Containers/Expressions/HorizontalOperations.h
+++ b/src/TNL/Containers/Expressions/HorizontalOperations.h
@@ -76,7 +76,17 @@ struct Max
    }
 };
 
-struct Minus
+struct UnaryPlus
+{
+   template< typename T1 >
+   __cuda_callable__
+   static auto evaluate( const T1& a ) -> decltype( +a )
+   {
+      return +a;
+   }
+};
+
+struct UnaryMinus
 {
    template< typename T1 >
    __cuda_callable__
diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
index 3709b5630..006a4a36a 100644
--- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
@@ -387,6 +387,17 @@ dot( const ET1& a, const ET2& b )
    return (a, b);
 }
 
+////
+// Unary expression plus
+template< typename ET1,
+          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+__cuda_callable__
+auto
+operator+( const ET1& a )
+{
+   return StaticUnaryExpressionTemplate< ET1, UnaryPlus >( a );
+}
+
 ////
 // Unary expression minus
 template< typename ET1,
@@ -395,7 +406,7 @@ __cuda_callable__
 auto
 operator-( const ET1& a )
 {
-   return StaticUnaryExpressionTemplate< ET1, Minus >( a );
+   return StaticUnaryExpressionTemplate< ET1, UnaryMinus >( a );
 }
 
 ////
diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
index b79b675cf..341418f85 100644
--- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
@@ -28,6 +28,7 @@
 #endif
 
 #include "VectorHelperFunctions.h"
+#include "../CustomScalar.h"
 
 #include "gtest/gtest.h"
 
@@ -163,8 +164,8 @@ protected:
             DistributedVectorView< short, Devices::Host, int > >,
       Pair< DistributedVectorView< int,   Devices::Host, int >,
             DistributedVector<     short, Devices::Host, int > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int >,
-            DistributedVectorView< short, Devices::Host, int > >
+      Pair< DistributedVectorView< CustomScalar< int >,   Devices::Host, int >,
+            DistributedVectorView< CustomScalar< short >, Devices::Host, int > >
    #else
       Pair< DistributedVector<     int,   Devices::Cuda, int >,
             DistributedVector<     short, Devices::Cuda, int > >,
@@ -172,8 +173,8 @@ protected:
             DistributedVectorView< short, Devices::Cuda, int > >,
       Pair< DistributedVectorView< int,   Devices::Cuda, int >,
             DistributedVector<     short, Devices::Cuda, int > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int >,
-            DistributedVectorView< short, Devices::Cuda, int > >
+      Pair< DistributedVectorView< CustomScalar< int >,   Devices::Cuda, int >,
+            DistributedVectorView< CustomScalar< short >, Devices::Cuda, int > >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -183,20 +184,21 @@ protected:
          Pair< StaticVector< 2, StaticVector< 3, int > >,  StaticVector< 2, StaticVector< 3, short > > >,
          Pair< StaticVector< 3, StaticVector< 3, int > >,  StaticVector< 3, StaticVector< 3, short > > >,
          Pair< StaticVector< 4, StaticVector< 3, int > >,  StaticVector< 4, StaticVector< 3, short > > >,
-         Pair< StaticVector< 5, StaticVector< 3, int > >,  StaticVector< 5, StaticVector< 3, short > > >
+         Pair< StaticVector< 5, StaticVector< 3, int > >,  StaticVector< 5, StaticVector< 3, short > > >,
+         Pair< StaticVector< 5, StaticVector< 3, CustomScalar< int > > >,  StaticVector< 5, StaticVector< 3, CustomScalar< short > > > >
       >;
    #else
       using VectorPairs = ::testing::Types<
-         Pair< StaticVector< 1, int >,     StaticVector< 1, short >    >,
+         Pair< StaticVector< 1, int    >,  StaticVector< 1, short  > >,
          Pair< StaticVector< 1, double >,  StaticVector< 1, double > >,
-         Pair< StaticVector< 2, int >,     StaticVector< 2, short >    >,
+         Pair< StaticVector< 2, int    >,  StaticVector< 2, short  > >,
          Pair< StaticVector< 2, double >,  StaticVector< 2, double > >,
-         Pair< StaticVector< 3, int >,     StaticVector< 3, short >    >,
+         Pair< StaticVector< 3, int    >,  StaticVector< 3, short  > >,
          Pair< StaticVector< 3, double >,  StaticVector< 3, double > >,
-         Pair< StaticVector< 4, int >,     StaticVector< 4, short >    >,
+         Pair< StaticVector< 4, int    >,  StaticVector< 4, short  > >,
          Pair< StaticVector< 4, double >,  StaticVector< 4, double > >,
-         Pair< StaticVector< 5, int >,     StaticVector< 5, short >    >,
-         Pair< StaticVector< 5, double >,  StaticVector< 5, double > >
+         Pair< StaticVector< 5, int    >,  StaticVector< 5, CustomScalar< short > > >,
+         Pair< StaticVector< 5, double >,  StaticVector< 5, CustomScalar< double > > >
       >;
    #endif
 #else
@@ -217,33 +219,25 @@ protected:
    #else
       using VectorPairs = ::testing::Types<
       #ifndef HAVE_CUDA
-         Pair< Vector<     int,       Devices::Host >, Vector<     int,       Devices::Host > >,
-         Pair< VectorView< int,       Devices::Host >, Vector<     int,       Devices::Host > >,
-         Pair< VectorView< const int, Devices::Host >, Vector<     int,       Devices::Host > >,
-         Pair< Vector<     int,       Devices::Host >, VectorView< int,       Devices::Host > >,
-         Pair< Vector<     int,       Devices::Host >, VectorView< const int, Devices::Host > >,
-         Pair< VectorView< int,       Devices::Host >, VectorView< int,       Devices::Host > >,
-         Pair< VectorView< const int, Devices::Host >, VectorView< int,       Devices::Host > >,
-         Pair< VectorView< const int, Devices::Host >, VectorView< const int, Devices::Host > >,
-         Pair< VectorView< int,       Devices::Host >, VectorView< const int, Devices::Host > >,
-         Pair< Vector<     double,    Devices::Host >, Vector<     double,    Devices::Host > >,
-         Pair< VectorView< double,    Devices::Host >, Vector<     double,    Devices::Host > >,
-         Pair< Vector<     double,    Devices::Host >, VectorView< double,    Devices::Host > >,
-         Pair< VectorView< double,    Devices::Host >, VectorView< double,    Devices::Host > >
+         Pair< Vector<     int,                 Devices::Host >, Vector<     int,                          Devices::Host > >,
+         Pair< VectorView< int,                 Devices::Host >, Vector<     int,                          Devices::Host > >,
+         Pair< VectorView< const int,           Devices::Host >, Vector<     int,                          Devices::Host > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Host >, VectorView< CustomScalar< double >,       Devices::Host > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Host >, VectorView< const CustomScalar< double >, Devices::Host > >,
+         Pair< VectorView< CustomScalar< int >, Devices::Host >, VectorView< CustomScalar< double >,       Devices::Host > >,
+         Pair< VectorView< const int,           Devices::Host >, VectorView< int,                          Devices::Host > >,
+         Pair< VectorView< const int,           Devices::Host >, VectorView< const int,                    Devices::Host > >,
+         Pair< VectorView< int,                 Devices::Host >, VectorView< const int,                    Devices::Host > >
       #else
-         Pair< Vector<     int,       Devices::Cuda >, Vector<     int,       Devices::Cuda > >,
-         Pair< VectorView< int,       Devices::Cuda >, Vector<     int,       Devices::Cuda > >,
-         Pair< VectorView< const int, Devices::Cuda >, Vector<     int,       Devices::Cuda > >,
-         Pair< Vector<     int,       Devices::Cuda >, VectorView< int,       Devices::Cuda > >,
-         Pair< Vector<     int,       Devices::Cuda >, VectorView< const int, Devices::Cuda > >,
-         Pair< VectorView< int,       Devices::Cuda >, VectorView< int,       Devices::Cuda > >,
-         Pair< VectorView< const int, Devices::Cuda >, VectorView< int,       Devices::Cuda > >,
-         Pair< VectorView< const int, Devices::Cuda >, VectorView< const int, Devices::Cuda > >,
-         Pair< VectorView< int,       Devices::Cuda >, VectorView< const int, Devices::Cuda > >,
-         Pair< Vector<     double,    Devices::Cuda >, Vector<     double,    Devices::Cuda > >,
-         Pair< VectorView< double,    Devices::Cuda >, Vector<     double,    Devices::Cuda > >,
-         Pair< Vector<     double,    Devices::Cuda >, VectorView< double,    Devices::Cuda > >,
-         Pair< VectorView< double,    Devices::Cuda >, VectorView< double,    Devices::Cuda > >
+         Pair< Vector<     int,                 Devices::Cuda >, Vector<     int,                          Devices::Cuda > >,
+         Pair< VectorView< int,                 Devices::Cuda >, Vector<     int,                          Devices::Cuda > >,
+         Pair< VectorView< const int,           Devices::Cuda >, Vector<     int,                          Devices::Cuda > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Cuda >, VectorView< CustomScalar< double >,       Devices::Cuda > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Cuda >, VectorView< const CustomScalar< double >, Devices::Cuda > >,
+         Pair< VectorView< CustomScalar< int >, Devices::Cuda >, VectorView< CustomScalar< double >,       Devices::Cuda > >,
+         Pair< VectorView< const int,           Devices::Cuda >, VectorView< int,                          Devices::Cuda > >,
+         Pair< VectorView< const int,           Devices::Cuda >, VectorView< const int,                    Devices::Cuda > >,
+         Pair< VectorView< int,                 Devices::Cuda >, VectorView< const int,                    Devices::Cuda > >
       #endif
       >;
    #endif
diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h
index f0d67076a..1915f535d 100644
--- a/src/UnitTests/Containers/VectorHelperFunctions.h
+++ b/src/UnitTests/Containers/VectorHelperFunctions.h
@@ -28,11 +28,10 @@ void setLinearSequence( Vector& deviceVector )
    deviceVector = a;
 }
 
-template< typename Vector >
-void setConstantSequence( Vector& deviceVector,
-                          typename Vector::ValueType v )
+template< typename Vector, typename Value >
+void setConstantSequence( Vector& deviceVector, Value v )
 {
-   deviceVector.setValue( v );
+   deviceVector.setValue( typename Vector::ValueType( v ) );
 }
 
 template< typename Vector >
@@ -47,8 +46,7 @@ void setOscilatingLinearSequence( Vector& deviceVector )
 }
 
 template< typename Vector >
-void setOscilatingConstantSequence( Vector& deviceVector,
-                                    typename Vector::ValueType v )
+void setOscilatingConstantSequence( Vector& deviceVector )
 {
    using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
@@ -78,9 +76,8 @@ void setNegativeLinearSequence( Vector& deviceVector )
    deviceVector = a;
 }
 
-template< typename Vector >
-void setOscilatingSequence( Vector& deviceVector,
-                            typename Vector::ValueType v )
+template< typename Vector, typename Value >
+void setOscilatingSequence( Vector& deviceVector, Value v )
 {
 #ifdef STATIC_VECTOR
    Vector a;
diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
index 43e2e2687..eb3c65633 100644
--- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
@@ -28,6 +28,7 @@
 #endif
 
 #include "VectorHelperFunctions.h"
+#include "../CustomScalar.h"
 
 #include "gtest/gtest.h"
 
@@ -76,11 +77,13 @@ protected:
    #ifndef HAVE_CUDA
       DistributedVector<           double, Devices::Host, int >,
       DistributedVectorView<       double, Devices::Host, int >,
-      DistributedVectorView< const double, Devices::Host, int >
+      DistributedVectorView< const double, Devices::Host, int >,
+      DistributedVector< CustomScalar< double >, Devices::Host, int >
    #else
       DistributedVector<           double, Devices::Cuda, int >,
       DistributedVectorView<       double, Devices::Cuda, int >,
-      DistributedVectorView< const double, Devices::Cuda, int >
+      DistributedVectorView< const double, Devices::Cuda, int >,
+      DistributedVector< CustomScalar< double >, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -90,7 +93,7 @@ protected:
          StaticVector< 2, StaticVector< 3, double > >,
          StaticVector< 3, StaticVector< 3, double > >,
          StaticVector< 4, StaticVector< 3, double > >,
-         StaticVector< 5, StaticVector< 3, double > >
+         StaticVector< 5, StaticVector< 3, CustomScalar< double > > >
       >;
    #else
       using VectorTypes = ::testing::Types<
@@ -102,8 +105,8 @@ protected:
          StaticVector< 3, double >,
          StaticVector< 4, int >,
          StaticVector< 4, double >,
-         StaticVector< 5, int >,
-         StaticVector< 5, double >
+         StaticVector< 5, CustomScalar< int > >,
+         StaticVector< 5, CustomScalar< double > >
       >;
    #endif
 #else
@@ -111,10 +114,12 @@ protected:
       using VectorTypes = ::testing::Types<
       #ifndef HAVE_CUDA
          Vector<     StaticVector< 3, double >, Devices::Host >,
-         VectorView< StaticVector< 3, double >, Devices::Host >
+         VectorView< StaticVector< 3, double >, Devices::Host >,
+         VectorView< StaticVector< 3, CustomScalar< double > >, Devices::Host >
       #else
          Vector<     StaticVector< 3, double >, Devices::Cuda >,
-         VectorView< StaticVector< 3, double >, Devices::Cuda >
+         VectorView< StaticVector< 3, double >, Devices::Cuda >,
+         VectorView< StaticVector< 3, CustomScalar< double > >, Devices::Cuda >
       #endif
       >;
    #else
@@ -124,14 +129,18 @@ protected:
          VectorView< int,       Devices::Host >,
          VectorView< const int, Devices::Host >,
          Vector<     double,    Devices::Host >,
-         VectorView< double,    Devices::Host >
+         VectorView< double,    Devices::Host >,
+         Vector<     CustomScalar< int >, Devices::Host >,
+         VectorView< CustomScalar< int >, Devices::Host >
       #endif
       #ifdef HAVE_CUDA
          Vector<     int,       Devices::Cuda >,
          VectorView< int,       Devices::Cuda >,
          VectorView< const int, Devices::Cuda >,
          Vector<     double,    Devices::Cuda >,
-         VectorView< double,    Devices::Cuda >
+         VectorView< double,    Devices::Cuda >,
+         Vector<     CustomScalar< int >, Devices::Cuda >,
+         VectorView< CustomScalar< int >, Devices::Cuda >
       #endif
       >;
    #endif
@@ -164,7 +173,7 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       const double h = (double) (end - begin) / _size;         \
       for( int i = 0; i < _size; i++ )                         \
       {                                                        \
-         const RealType x = begin + i * h;                     \
+         const RealType x = begin + RealType( i * h );         \
          V1[ i ] = x;                                          \
          expected[ i ] = function(x);                          \
       }                                                        \
@@ -209,7 +218,7 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       const double h = (double) (end - begin) / size;          \
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) \
       {                                                        \
-         const RealType x = begin + i * h;                     \
+         const RealType x = begin + RealType( i * h );         \
          _V1h[ i ] = x;                                        \
          expected_h[ i ] = function(x);                        \
       }                                                        \
@@ -229,11 +238,12 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
    #define SETUP_UNARY_VECTOR_TEST( size ) \
       using VectorType = typename TestFixture::VectorType;     \
       using VectorOrView = typename TestFixture::VectorOrView; \
+      using ValueType = typename VectorType::ValueType;        \
                                                                \
       VectorType _V1( size ), _V2( size );                     \
                                                                \
-      _V1 = 1;                                                 \
-      _V2 = 2;                                                 \
+      _V1 = ValueType( 1 );                                    \
+      _V2 = ValueType( 2 );                                    \
                                                                \
       VectorOrView V1( _V1 ), V2( _V2 );                       \
 
@@ -251,7 +261,7 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       const double h = (double) (end - begin) / size;          \
       for( int i = 0; i < size; i++ )                          \
       {                                                        \
-         const RealType x = begin + i * h;                     \
+         const RealType x = begin + RealType( i * h );         \
          _V1h[ i ] = x;                                        \
          expected_h[ i ] = function(x);                        \
       }                                                        \
diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
index f73b502cc..b201f563d 100644
--- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h
+++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
@@ -28,6 +28,7 @@
 #endif
 
 #include "VectorHelperFunctions.h"
+#include "../CustomScalar.h"
 
 #include "gtest/gtest.h"
 
@@ -112,11 +113,13 @@ protected:
    #ifndef HAVE_CUDA
       DistributedVector<           double, Devices::Host, int >,
       DistributedVectorView<       double, Devices::Host, int >,
-      DistributedVectorView< const double, Devices::Host, int >
+      DistributedVectorView< const double, Devices::Host, int >,
+      DistributedVector< CustomScalar< double >, Devices::Host, int >
    #else
       DistributedVector<           double, Devices::Cuda, int >,
       DistributedVectorView<       double, Devices::Cuda, int >,
-      DistributedVectorView< const double, Devices::Cuda, int >
+      DistributedVectorView< const double, Devices::Cuda, int >,
+      DistributedVector< CustomScalar< double >, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -126,7 +129,7 @@ protected:
          StaticVector< 2, StaticVector< 3, double > >,
          StaticVector< 3, StaticVector< 3, double > >,
          StaticVector< 4, StaticVector< 3, double > >,
-         StaticVector< 5, StaticVector< 3, double > >
+         StaticVector< 5, StaticVector< 3, CustomScalar< double > > >
       >;
    #else
       using VectorTypes = ::testing::Types<
@@ -138,8 +141,8 @@ protected:
          StaticVector< 3, double >,
          StaticVector< 4, int >,
          StaticVector< 4, double >,
-         StaticVector< 5, int >,
-         StaticVector< 5, double >
+         StaticVector< 5, CustomScalar< int > >,
+         StaticVector< 5, CustomScalar< double > >
       >;
    #endif
 #else
@@ -160,14 +163,18 @@ protected:
          VectorView< int,       Devices::Host >,
          VectorView< const int, Devices::Host >,
          Vector<     double,    Devices::Host >,
-         VectorView< double,    Devices::Host >
+         VectorView< double,    Devices::Host >,
+         Vector<     CustomScalar< int >, Devices::Host >,
+         VectorView< CustomScalar< int >, Devices::Host >
       #endif
       #ifdef HAVE_CUDA
          Vector<     int,       Devices::Cuda >,
          VectorView< int,       Devices::Cuda >,
          VectorView< const int, Devices::Cuda >,
          Vector<     double,    Devices::Cuda >,
-         VectorView< double,    Devices::Cuda >
+         VectorView< double,    Devices::Cuda >,
+         Vector<     CustomScalar< int >, Devices::Cuda >,
+         VectorView< CustomScalar< int >, Devices::Cuda >
       #endif
       >;
    #endif
-- 
GitLab


From 79a3009d19eb7432230f16beaf6224aec3eef765 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Fri, 6 Aug 2021 18:28:09 +0200
Subject: [PATCH 49/52] Refactored horizontal operations in expression
 templates using function objects

- structs from HorizontalOperations.h reimplemented as function objects in
  Functional.h
- repetitive function definitions generated using macros
- added new operators: % (modulus) and ^ (xor)
---
 .../DistributedExpressionTemplates.h          | 447 ++++------------
 .../DistributedVerticalOperations.h           |  15 +-
 .../Expressions/ExpressionTemplates.h         | 457 ++++------------
 .../Expressions/HorizontalOperations.h        | 345 -------------
 .../Expressions/StaticExpressionTemplates.h   | 487 ++++--------------
 .../Expressions/StaticVerticalOperations.h    |  10 +
 src/TNL/Containers/Expressions/TypeTraits.h   |  30 +-
 src/TNL/Functional.h                          | 256 +++++++--
 8 files changed, 579 insertions(+), 1468 deletions(-)
 delete mode 100644 src/TNL/Containers/Expressions/HorizontalOperations.h

diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index bb1c0fa8b..a713b00d4 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -56,7 +56,7 @@ template< typename T1,
           typename Operation >
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>()[0] ) );
    using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
@@ -155,7 +155,7 @@ template< typename T1,
           typename Operation >
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>() ) );
    using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
@@ -237,7 +237,7 @@ template< typename T1,
           typename Operation >
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>(), std::declval<T2>()[0] ) );
    using ValueType = RealType;
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
@@ -320,7 +320,7 @@ template< typename T1,
           typename Operation >
 struct DistributedUnaryExpressionTemplate
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0] ) );
    using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
@@ -398,50 +398,86 @@ protected:
 
 #ifndef DOXYGEN_ONLY
 
-////
-// Binary expressions addition
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-operator+( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Addition >( a, b );
-}
-
-////
-// Binary expression subtraction
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-operator-( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Subtraction >( a, b );
-}
+#define TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION(fname, functor)  /* generates fname( a ) */    \
+   template< typename ET1,  /* ET1 is deduced from the argument */                           \
+             typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >         \
+   auto  /* deduces to DistributedUnaryExpressionTemplate< ET1, functor > */                 \
+   fname( const ET1& a )                                                                     \
+   {                                                                                         \
+      return DistributedUnaryExpressionTemplate< ET1, functor >( a );                        \
+   }                                                                                         \
+
+#define TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION(fname, functor)  /* generates fname( a, b ) */ \
+   template< typename ET1, typename ET2,  /* both deduced from the arguments */              \
+             typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >   \
+   auto  /* deduces to DistributedBinaryExpressionTemplate< ET1, ET2, functor > */           \
+   fname( const ET1& a, const ET2& b )                                                       \
+   {                                                                                         \
+      return DistributedBinaryExpressionTemplate< ET1, ET2, functor >( a, b );               \
+   }                                                                                         \
+
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator+, TNL::Plus )  // two-operand overloads: each line expands to fname( a, b )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator-, TNL::Minus )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator*, TNL::Multiplies )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator/, TNL::Divides )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator%, TNL::Modulus )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( min, TNL::Min )  // two-argument min/max; the one-argument min( a )/max( a ) overloads are defined separately in this header
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( max, TNL::Max )
+
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( operator+, TNL::UnaryPlus )  // one-operand overloads: each line expands to fname( a )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( operator-, TNL::UnaryMinus )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( abs, TNL::Abs )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( exp, TNL::Exp )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sqrt, TNL::Sqrt )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( cbrt, TNL::Cbrt )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( log, TNL::Log )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( log10, TNL::Log10 )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( log2, TNL::Log2 )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sin, TNL::Sin )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( cos, TNL::Cos )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( tan, TNL::Tan )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( asin, TNL::Asin )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( acos, TNL::Acos )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( atan, TNL::Atan )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sinh, TNL::Sinh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( cosh, TNL::Cosh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( tanh, TNL::Tanh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( asinh, TNL::Asinh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( acosh, TNL::Acosh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( atanh, TNL::Atanh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( floor, TNL::Floor )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( ceil, TNL::Ceil )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sign, TNL::Sign )
+
+#undef TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION  // the generator macros are needed only for the list above
+#undef TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION
 
 ////
-// Binary expression multiplication
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+// Pow: pairs the expression `a` with the exponent `exp` under the Pow functor
+template< typename ET1, typename Real,
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >  // only ET1 is constrained; Real is not checked here
 auto
-operator*( const ET1& a, const ET2& b )
+pow( const ET1& a, const Real& exp )
 {
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Multiplication >( a, b );
+   return DistributedBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
 }
 
 ////
-// Binary expression division
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+// Cast: ResultType cannot be deduced and must be given explicitly, i.e. cast< ResultType >( a )
+template< typename ResultType,
+          typename ET1,
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
-operator/( const ET1& a, const ET2& b )
+cast( const ET1& a )
 {
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Division >( a, b );
+   using CastOperation = typename Cast< ResultType >::Operation;  // functor type that Cast provides for ResultType
+   return DistributedUnaryExpressionTemplate< ET1, CastOperation >( a );
 }
 
 ////
 // Comparison operator ==
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator==( const ET1& a, const ET2& b )
 {
@@ -451,7 +487,7 @@ operator==( const ET1& a, const ET2& b )
 ////
 // Comparison operator !=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator!=( const ET1& a, const ET2& b )
 {
@@ -461,7 +497,7 @@ operator!=( const ET1& a, const ET2& b )
 ////
 // Comparison operator <
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<( const ET1& a, const ET2& b )
 {
@@ -471,7 +507,7 @@ operator<( const ET1& a, const ET2& b )
 ////
 // Comparison operator <=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<=( const ET1& a, const ET2& b )
 {
@@ -481,7 +517,7 @@ operator<=( const ET1& a, const ET2& b )
 ////
 // Comparison operator >
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>( const ET1& a, const ET2& b )
 {
@@ -491,7 +527,7 @@ operator>( const ET1& a, const ET2& b )
 ////
 // Comparison operator >=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>=( const ET1& a, const ET2& b )
 {
@@ -501,7 +537,7 @@ operator>=( const ET1& a, const ET2& b )
 ////
 // Scalar product
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 operator,( const ET1& a, const ET2& b )
 {
@@ -509,301 +545,17 @@ operator,( const ET1& a, const ET2& b )
 }
 
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 dot( const ET1& a, const ET2& b )
 {
    return (a, b);
 }
 
-////
-// Unary expression plus
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-operator+( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, UnaryPlus >( a );
-}
-
-////
-// Unary expression minus
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-operator-( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, UnaryMinus >( a );
-}
-
-////
-// Binary expression min
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-min( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Min >( a, b );
-}
-
-////
-// Binary expression max
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-max( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Max >( a, b );
-}
-
-////
-// Abs
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-abs( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Abs >( a );
-}
-
-////
-// Pow
-template< typename ET1, typename Real,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-pow( const ET1& a, const Real& exp )
-{
-   return DistributedBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
-}
-
-////
-// Exp
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-exp( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Exp >( a );
-}
-
-////
-// Sqrt
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sqrt( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sqrt >( a );
-}
-
-////
-// Cbrt
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-cbrt( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Cbrt >( a );
-}
-
-////
-// Log
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-log( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Log >( a );
-}
-
-////
-// Log10
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-log10( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Log10 >( a );
-}
-
-////
-// Log2
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-log2( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Log2 >( a );
-}
-
-////
-// Sin
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sin( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sin >( a );
-}
-
-////
-// Cos
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-cos( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Cos >( a );
-}
-
-////
-// Tan
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-tan( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Tan >( a );
-}
-
-////
-// Asin
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-asin( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Asin >( a );
-}
-
-////
-// Acos
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-acos( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Acos >( a );
-}
-
-////
-// Atan
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-atan( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Atan >( a );
-}
-
-////
-// Sinh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sinh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sinh >( a );
-}
-
-////
-// Cosh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-cosh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Cosh >( a );
-}
-
-////
-// Tanh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-tanh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Tanh >( a );
-}
-
-////
-// Asinh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-asinh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Asinh >( a );
-}
-
-////
-// Acosh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-acosh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Acosh >( a );
-}
-
-////
-// Atanh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-atanh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Atanh >( a );
-}
-
-////
-// Floor
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-floor( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Floor >( a );
-}
-
-////
-// Ceil
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-ceil( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Ceil >( a );
-}
-
-////
-// Sign
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sign( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sign >( a );
-}
-
-////
-// Cast
-template< typename ResultType,
-          typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >,
-          // workaround: templated type alias cannot be declared at block level
-          typename CastOperation = typename Cast< ResultType >::Operation,
-          typename = void, typename = void >
-auto
-cast( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, CastOperation >( a );
-}
-
 ////
 // Vertical operations
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 min( const ET1& a )
 {
@@ -811,7 +563,7 @@ min( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 argMin( const ET1& a )
 {
@@ -819,7 +571,7 @@ argMin( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 max( const ET1& a )
 {
@@ -827,7 +579,7 @@ max( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 argMax( const ET1& a )
 {
@@ -835,7 +587,7 @@ argMax( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 sum( const ET1& a )
 {
@@ -843,7 +595,7 @@ sum( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 maxNorm( const ET1& a )
 {
@@ -851,7 +603,7 @@ maxNorm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 l1Norm( const ET1& a )
 {
@@ -859,7 +611,7 @@ l1Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 l2Norm( const ET1& a )
 {
@@ -869,7 +621,7 @@ l2Norm( const ET1& a )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 lpNorm( const ET1& a, const Real& p )
 // since (1.0 / p) has type double, TNL::pow returns double
@@ -884,7 +636,7 @@ lpNorm( const ET1& a, const Real& p )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 product( const ET1& a )
 {
@@ -892,7 +644,15 @@ product( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
+auto
+logicalAnd( const ET1& a )
+{
+   return DistributedExpressionLogicalAnd( a );
+}
+
+template< typename ET1,
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 logicalOr( const ET1& a )
 {
@@ -900,15 +660,15 @@ logicalOr( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
-logicalAnd( const ET1& a )
+binaryAnd( const ET1& a )
 {
-   return DistributedExpressionLogicalAnd( a );
+   return DistributedExpressionBinaryAnd( a );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 binaryOr( const ET1& a )
 {
@@ -916,11 +676,11 @@ binaryOr( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
-binaryAnd( const ET1& a )
+binaryXor( const ET1& a )  // bitwise-XOR counterpart of binaryAnd/binaryOr
 {
-   return DistributedExpressionBinaryAnd( a );
+   return DistributedExpressionBinaryXor( a );  // forwards unchanged; the result type comes from that helper
 }
 
 ////
@@ -976,6 +736,7 @@ using Expressions::operator+;
 using Expressions::operator-;
 using Expressions::operator*;
 using Expressions::operator/;
+using Expressions::operator%;
 using Expressions::operator,;
 using Expressions::operator==;
 using Expressions::operator!=;
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index 37bf2c868..f3b826a9f 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -174,7 +174,7 @@ auto DistributedExpressionLogicalOr( const Expression& expression ) -> std::deca
 }
 
 template< typename Expression >
-auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) >
+auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] & expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >;
 
@@ -201,6 +201,19 @@ auto DistributedExpressionBinaryOr( const Expression& expression ) -> std::decay
    return result;
 }
 
+template< typename Expression >  // XOR-reduces a distributed expression: local reduce, then MPI Allreduce
+auto DistributedExpressionBinaryXor( const Expression& expression ) -> std::decay_t< decltype( expression[0] ^ expression[0] ) >
+{
+   using ResultType = std::decay_t< decltype( expression[0] ^ expression[0] ) >;  // decayed type of `element ^ element`
+
+   ResultType result = 0;  // returned as-is when the communication group is MPI::NullGroup()
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {  // for a null group both reductions are skipped
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::BitXor{} );  // XOR over the local view
+      MPI::Allreduce( &localResult, &result, 1, MPI_BXOR, expression.getCommunicationGroup() );  // combine the single local value across the group with MPI_BXOR
+   }
+   return result;
+}
+
 } // namespace Expressions
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 05b22fab1..58d92609c 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -13,11 +13,11 @@
 #include <ostream>
 #include <utility>
 
+#include <TNL/Functional.h>
 #include <TNL/TypeTraits.h>
 #include <TNL/Containers/Expressions/TypeTraits.h>
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
 #include <TNL/Containers/Expressions/Comparison.h>
-#include <TNL/Containers/Expressions/HorizontalOperations.h>
 #include <TNL/Algorithms/reduce.h>
 
 namespace TNL {
@@ -58,7 +58,7 @@ template< typename T1,
           typename Operation >
 struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>()[0] ) );
    using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
@@ -80,13 +80,13 @@ struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, Ve
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( op1.getElement( i ), op2.getElement( i ) );
+      return Operation{}( op1.getElement( i ), op2.getElement( i ) );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( op1[ i ], op2[ i ] );
+      return Operation{}( op1[ i ], op2[ i ] );
    }
 
    __cuda_callable__
@@ -116,7 +116,7 @@ template< typename T1,
           typename Operation >
 struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>() ) );
    using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
@@ -130,13 +130,13 @@ struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, Ar
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( op1.getElement( i ), op2 );
+      return Operation{}( op1.getElement( i ), op2 );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( op1[ i ], op2 );
+      return Operation{}( op1[ i ], op2 );
    }
 
    __cuda_callable__
@@ -166,7 +166,7 @@ template< typename T1,
           typename Operation >
 struct BinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>(), std::declval<T2>()[0] ) );
    using ValueType = RealType;
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
@@ -180,13 +180,13 @@ struct BinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorEx
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( op1, op2.getElement( i ) );
+      return Operation{}( op1, op2.getElement( i ) );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( op1, op2[ i ] );
+      return Operation{}( op1, op2[ i ] );
    }
 
    __cuda_callable__
@@ -217,7 +217,7 @@ template< typename T1,
           typename Operation >
 struct UnaryExpressionTemplate
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0] ) );
    using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
@@ -231,13 +231,13 @@ struct UnaryExpressionTemplate
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( operand.getElement( i ) );
+      return Operation{}( operand.getElement( i ) );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( operand[ i ] );
+      return Operation{}( operand[ i ] );
    }
 
    __cuda_callable__
@@ -263,50 +263,86 @@ protected:
 
 #ifndef DOXYGEN_ONLY
 
-////
-// Binary expressions addition
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-operator+( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Addition >( a, b );
-}
+#define TNL_MAKE_UNARY_EXPRESSION(fname, functor)                                \
+   template< typename ET1,                                                       \
+             typename..., EnableIfUnaryExpression_t< ET1, bool > = true >        \
+   auto                                                                          \
+   fname( const ET1& a )                                                         \
+   {                                                                             \
+      return UnaryExpressionTemplate< ET1, functor >( a );                       \
+   }                                                                             \
+
+#define TNL_MAKE_BINARY_EXPRESSION(fname, functor)                               \
+   template< typename ET1, typename ET2,                                         \
+             typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >  \
+   auto                                                                          \
+   fname( const ET1& a, const ET2& b )                                           \
+   {                                                                             \
+      return BinaryExpressionTemplate< ET1, ET2, functor >( a, b );              \
+   }                                                                             \
+
+TNL_MAKE_BINARY_EXPRESSION( operator+, TNL::Plus )
+TNL_MAKE_BINARY_EXPRESSION( operator-, TNL::Minus )
+TNL_MAKE_BINARY_EXPRESSION( operator*, TNL::Multiplies )
+TNL_MAKE_BINARY_EXPRESSION( operator/, TNL::Divides )
+TNL_MAKE_BINARY_EXPRESSION( operator%, TNL::Modulus )
+TNL_MAKE_BINARY_EXPRESSION( min, TNL::Min )
+TNL_MAKE_BINARY_EXPRESSION( max, TNL::Max )
+
+TNL_MAKE_UNARY_EXPRESSION( operator+, TNL::UnaryPlus )
+TNL_MAKE_UNARY_EXPRESSION( operator-, TNL::UnaryMinus )
+TNL_MAKE_UNARY_EXPRESSION( abs, TNL::Abs )
+TNL_MAKE_UNARY_EXPRESSION( exp, TNL::Exp )
+TNL_MAKE_UNARY_EXPRESSION( sqrt, TNL::Sqrt )
+TNL_MAKE_UNARY_EXPRESSION( cbrt, TNL::Cbrt )
+TNL_MAKE_UNARY_EXPRESSION( log, TNL::Log )
+TNL_MAKE_UNARY_EXPRESSION( log10, TNL::Log10 )
+TNL_MAKE_UNARY_EXPRESSION( log2, TNL::Log2 )
+TNL_MAKE_UNARY_EXPRESSION( sin, TNL::Sin )
+TNL_MAKE_UNARY_EXPRESSION( cos, TNL::Cos )
+TNL_MAKE_UNARY_EXPRESSION( tan, TNL::Tan )
+TNL_MAKE_UNARY_EXPRESSION( asin, TNL::Asin )
+TNL_MAKE_UNARY_EXPRESSION( acos, TNL::Acos )
+TNL_MAKE_UNARY_EXPRESSION( atan, TNL::Atan )
+TNL_MAKE_UNARY_EXPRESSION( sinh, TNL::Sinh )
+TNL_MAKE_UNARY_EXPRESSION( cosh, TNL::Cosh )
+TNL_MAKE_UNARY_EXPRESSION( tanh, TNL::Tanh )
+TNL_MAKE_UNARY_EXPRESSION( asinh, TNL::Asinh )
+TNL_MAKE_UNARY_EXPRESSION( acosh, TNL::Acosh )
+TNL_MAKE_UNARY_EXPRESSION( atanh, TNL::Atanh )
+TNL_MAKE_UNARY_EXPRESSION( floor, TNL::Floor )
+TNL_MAKE_UNARY_EXPRESSION( ceil, TNL::Ceil )
+TNL_MAKE_UNARY_EXPRESSION( sign, TNL::Sign )
+
+#undef TNL_MAKE_UNARY_EXPRESSION
+#undef TNL_MAKE_BINARY_EXPRESSION
 
 ////
-// Binary expression subtraction
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-operator-( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Subtraction >( a, b );
-}
-
-////
-// Binary expression multiplication
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+// Pow
+template< typename ET1, typename Real,
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
-operator*( const ET1& a, const ET2& b )
+pow( const ET1& a, const Real& exp )
 {
-   return BinaryExpressionTemplate< ET1, ET2, Multiplication >( a, b );
+   return BinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
 }
 
 ////
-// Binary expression division
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+// Cast
+template< typename ResultType,
+          typename ET1,
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
-operator/( const ET1& a, const ET2& b )
+cast( const ET1& a )
 {
-   return BinaryExpressionTemplate< ET1, ET2, Division >( a, b );
+   using CastOperation = typename Cast< ResultType >::Operation;
+   return UnaryExpressionTemplate< ET1, CastOperation >( a );
 }
 
 ////
 // Comparison operator ==
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator==( const ET1& a, const ET2& b )
 {
@@ -316,7 +352,7 @@ operator==( const ET1& a, const ET2& b )
 ////
 // Comparison operator !=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator!=( const ET1& a, const ET2& b )
 {
@@ -326,7 +362,7 @@ operator!=( const ET1& a, const ET2& b )
 ////
 // Comparison operator <
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<( const ET1& a, const ET2& b )
 {
@@ -336,7 +372,7 @@ operator<( const ET1& a, const ET2& b )
 ////
 // Comparison operator <=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<=( const ET1& a, const ET2& b )
 {
@@ -346,7 +382,7 @@ operator<=( const ET1& a, const ET2& b )
 ////
 // Comparison operator >
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>( const ET1& a, const ET2& b )
 {
@@ -356,7 +392,7 @@ operator>( const ET1& a, const ET2& b )
 ////
 // Comparison operator >=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>=( const ET1& a, const ET2& b )
 {
@@ -366,7 +402,7 @@ operator>=( const ET1& a, const ET2& b )
 ////
 // Scalar product
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 operator,( const ET1& a, const ET2& b )
 {
@@ -374,301 +410,17 @@ operator,( const ET1& a, const ET2& b )
 }
 
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 dot( const ET1& a, const ET2& b )
 {
    return (a, b);
 }
 
-////
-// Unary expression plus
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-operator+( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, UnaryPlus >( a );
-}
-
-////
-// Unary expression minus
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-operator-( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, UnaryMinus >( a );
-}
-
-////
-// Binary expression min
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-min( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Min >( a, b );
-}
-
-////
-// Binary expression max
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-max( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Max >( a, b );
-}
-
-////
-// Abs
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-abs( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Abs >( a );
-}
-
-////
-// Pow
-template< typename ET1, typename Real,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-pow( const ET1& a, const Real& exp )
-{
-   return BinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
-}
-
-////
-// Exp
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-exp( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Exp >( a );
-}
-
-////
-// Sqrt
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sqrt( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sqrt >( a );
-}
-
-////
-// Cbrt
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-cbrt( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Cbrt >( a );
-}
-
-////
-// Log
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-log( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Log >( a );
-}
-
-////
-// Log10
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-log10( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Log10 >( a );
-}
-
-////
-// Log2
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-log2( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Log2 >( a );
-}
-
-////
-// Sin
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sin( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sin >( a );
-}
-
-////
-// Cos
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-cos( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Cos >( a );
-}
-
-////
-// Tan
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-tan( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Tan >( a );
-}
-
-////
-// Asin
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-asin( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Asin >( a );
-}
-
-////
-// Acos
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-acos( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Acos >( a );
-}
-
-////
-// Atan
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-atan( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Atan >( a );
-}
-
-////
-// Sinh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sinh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sinh >( a );
-}
-
-////
-// Cosh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-cosh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Cosh >( a );
-}
-
-////
-// Tanh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-tanh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Tanh >( a );
-}
-
-////
-// Asinh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-asinh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Asinh >( a );
-}
-
-////
-// Acosh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-acosh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Acosh >( a );
-}
-
-////
-// Atanh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-atanh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Atanh >( a );
-}
-
-////
-// Floor
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-floor( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Floor >( a );
-}
-
-////
-// Ceil
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-ceil( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Ceil >( a );
-}
-
-////
-// Sign
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sign( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sign >( a );
-}
-
-////
-// Cast
-template< typename ResultType,
-          typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >,
-          // workaround: templated type alias cannot be declared at block level
-          typename CastOperation = typename Cast< ResultType >::Operation,
-          typename = void >
-auto
-cast( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, CastOperation >( a );
-}
-
 ////
 // Vertical operations
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 min( const ET1& a )
 {
@@ -676,7 +428,7 @@ min( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 argMin( const ET1& a )
 {
@@ -684,7 +436,7 @@ argMin( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 max( const ET1& a )
 {
@@ -692,7 +444,7 @@ max( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 argMax( const ET1& a )
 {
@@ -700,7 +452,7 @@ argMax( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 sum( const ET1& a )
 {
@@ -708,7 +460,7 @@ sum( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 maxNorm( const ET1& a )
 {
@@ -716,7 +468,7 @@ maxNorm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 l1Norm( const ET1& a )
 {
@@ -724,7 +476,7 @@ l1Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 l2Norm( const ET1& a )
 {
@@ -734,7 +486,7 @@ l2Norm( const ET1& a )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 lpNorm( const ET1& a, const Real& p )
 // since (1.0 / p) has type double, TNL::pow returns double
@@ -749,7 +501,7 @@ lpNorm( const ET1& a, const Real& p )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 product( const ET1& a )
 {
@@ -757,7 +509,7 @@ product( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 logicalAnd( const ET1& a )
 {
@@ -765,7 +517,7 @@ logicalAnd( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 logicalOr( const ET1& a )
 {
@@ -773,7 +525,7 @@ logicalOr( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 binaryAnd( const ET1& a )
 {
@@ -781,13 +533,21 @@ binaryAnd( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 binaryOr( const ET1& a )
 {
    return Algorithms::reduce( a, TNL::BitOr{} );
 }
 
+template< typename ET1,
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
+auto
+binaryXor( const ET1& a )
+{
+   return Algorithms::reduce( a, TNL::BitXor{} );
+}
+
 #endif // DOXYGEN_ONLY
 
 ////
@@ -823,6 +583,7 @@ using Expressions::operator+;
 using Expressions::operator-;
 using Expressions::operator*;
 using Expressions::operator/;
+using Expressions::operator%;
 using Expressions::operator,;
 using Expressions::operator==;
 using Expressions::operator!=;
diff --git a/src/TNL/Containers/Expressions/HorizontalOperations.h b/src/TNL/Containers/Expressions/HorizontalOperations.h
deleted file mode 100644
index 8000243d8..000000000
--- a/src/TNL/Containers/Expressions/HorizontalOperations.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/***************************************************************************
-                          HorizontalOperations.h  -  description
-                             -------------------
-    begin                : Apr 18, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Math.h>
-
-namespace TNL {
-namespace Containers {
-namespace Expressions {
-
-struct Addition
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a + b )
-   {
-      return a + b;
-   }
-};
-
-struct Subtraction
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a - b )
-   {
-      return a - b;
-   }
-};
-
-struct Multiplication
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a * b )
-   {
-      return a * b;
-   }
-};
-
-struct Division
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a / b )
-   {
-      return a / b;
-   }
-};
-
-struct Min
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( min( a, b ) )
-   {
-      return min( a, b );
-   }
-};
-
-struct Max
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( max( a, b ) )
-   {
-      return max( a, b );
-   }
-};
-
-struct UnaryPlus
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( +a )
-   {
-      return +a;
-   }
-};
-
-struct UnaryMinus
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( -a )
-   {
-      return -a;
-   }
-};
-
-struct Abs
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( abs( a ) )
-   {
-      return abs( a );
-   }
-};
-
-struct Pow
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& exp ) -> decltype( pow( a, exp ) )
-   {
-      return pow( a, exp );
-   }
-};
-
-struct Exp
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( exp( a ) )
-   {
-      return exp( a );
-   }
-};
-
-struct Sqrt
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sqrt( a ) )
-   {
-      return sqrt( a );
-   }
-};
-
-struct Cbrt
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( cbrt( a ) )
-   {
-      return cbrt( a );
-   }
-};
-
-struct Log
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( log( a ) )
-   {
-      return log( a );
-   }
-};
-
-struct Log10
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( log10( a ) )
-   {
-      return log10( a );
-   }
-};
-
-struct Log2
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( log2( a ) )
-   {
-      return log2( a );
-   }
-};
-
-struct Sin
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sin( a ) )
-   {
-      return sin( a );
-   }
-};
-
-struct Cos
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( cos( a ) )
-   {
-      return cos( a );
-   }
-};
-
-struct Tan
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( tan( a ) )
-   {
-      return tan( a );
-   }
-};
-
-struct Asin
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( asin( a ) )
-   {
-      return asin( a );
-   }
-};
-
-struct Acos
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( acos( a ) )
-   {
-      return acos( a );
-   }
-};
-
-struct Atan
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( atan( a ) )
-   {
-      return atan( a );
-   }
-};
-
-struct Sinh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sinh( a ) )
-   {
-      return sinh( a );
-   }
-};
-
-struct Cosh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( cosh( a ) )
-   {
-      return cosh( a );
-   }
-};
-
-struct Tanh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( tanh( a ) )
-   {
-      return tanh( a );
-   }
-};
-
-struct Asinh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( asinh( a ) )
-   {
-      return asinh( a );
-   }
-};
-
-struct Acosh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( acosh( a ) )
-   {
-      return acosh( a );
-   }
-};
-
-struct Atanh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( atanh( a ) )
-   {
-      return atanh( a );
-   }
-};
-
-struct Floor
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( floor( a ) )
-   {
-      return floor( a );
-   }
-};
-
-struct Ceil
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( ceil( a ) )
-   {
-      return ceil( a );
-   }
-};
-
-struct Sign
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sign( a ) )
-   {
-      return sign( a );
-   }
-};
-
-template< typename ResultType >
-struct Cast
-{
-   struct Operation
-   {
-      template< typename T1 >
-      __cuda_callable__
-      static auto evaluate( const T1& a ) -> ResultType
-      {
-         return static_cast<ResultType>( a );
-      }
-   };
-};
-
-} // namespace Expressions
-} // namespace Containers
-} // namespace TNL
diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
index 006a4a36a..102656e05 100644
--- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
@@ -13,10 +13,10 @@
 #include <ostream>
 #include <utility>
 
+#include <TNL/Functional.h>
 #include <TNL/TypeTraits.h>
 #include <TNL/Containers/Expressions/TypeTraits.h>
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/Containers/Expressions/HorizontalOperations.h>
 #include <TNL/Containers/Expressions/StaticComparison.h>
 #include <TNL/Containers/Expressions/StaticVerticalOperations.h>
 
@@ -59,7 +59,7 @@ template< typename T1,
 struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
    using VectorOperandType = T1;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>()[0] ) );
    using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
@@ -82,7 +82,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariab
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( op1[ i ], op2[ i ] );
+      return Operation{}( op1[ i ], op2[ i ] );
    }
 
    __cuda_callable__
@@ -114,7 +114,7 @@ template< typename T1,
 struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable  >
 {
    using VectorOperandType = T1;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>() ) );
    using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
@@ -131,7 +131,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariab
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( op1[ i ], op2 );
+      return Operation{}( op1[ i ], op2 );
    }
 
    __cuda_callable__
@@ -163,7 +163,7 @@ template< typename T1,
 struct StaticBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable  >
 {
    using VectorOperandType = T2;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>(), std::declval<T2>()[0] ) );
    using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T2 >::value,
@@ -180,7 +180,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, Ve
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( op1, op2[ i ] );
+      return Operation{}( op1, op2[ i ] );
    }
 
    __cuda_callable__
@@ -213,7 +213,7 @@ template< typename T1,
 struct StaticUnaryExpressionTemplate
 {
    using VectorOperandType = T1;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0] ) );
    using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
@@ -230,7 +230,7 @@ struct StaticUnaryExpressionTemplate
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( operand[ i ] );
+      return Operation{}( operand[ i ] );
    }
 
    __cuda_callable__
@@ -257,54 +257,90 @@ protected:
 
 #ifndef DOXYGEN_ONLY
 
-////
-// Binary expressions addition
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-operator+( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Addition >( a, b );
-}
+#define TNL_MAKE_STATIC_UNARY_EXPRESSION(fname, functor)                               \
+   template< typename ET1,                                                             \
+             typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >        \
+   __cuda_callable__                                                                   \
+   auto                                                                                \
+   fname( const ET1& a )                                                               \
+   {                                                                                   \
+      return StaticUnaryExpressionTemplate< ET1, functor >( a );                       \
+   }                                                                                   \
+
+#define TNL_MAKE_STATIC_BINARY_EXPRESSION(fname, functor)                              \
+   template< typename ET1, typename ET2,                                               \
+             typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >  \
+   __cuda_callable__                                                                   \
+   auto                                                                                \
+   fname( const ET1& a, const ET2& b )                                                 \
+   {                                                                                   \
+      return StaticBinaryExpressionTemplate< ET1, ET2, functor >( a, b );              \
+   }                                                                                   \
+
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator+, TNL::Plus )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator-, TNL::Minus )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator*, TNL::Multiplies )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator/, TNL::Divides )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator%, TNL::Modulus )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( min, TNL::Min )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( max, TNL::Max )
+
+TNL_MAKE_STATIC_UNARY_EXPRESSION( operator+, TNL::UnaryPlus )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( operator-, TNL::UnaryMinus )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( abs, TNL::Abs )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( exp, TNL::Exp )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sqrt, TNL::Sqrt )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( cbrt, TNL::Cbrt )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( log, TNL::Log )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( log10, TNL::Log10 )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( log2, TNL::Log2 )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sin, TNL::Sin )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( cos, TNL::Cos )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( tan, TNL::Tan )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( asin, TNL::Asin )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( acos, TNL::Acos )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( atan, TNL::Atan )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sinh, TNL::Sinh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( cosh, TNL::Cosh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( tanh, TNL::Tanh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( asinh, TNL::Asinh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( acosh, TNL::Acosh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( atanh, TNL::Atanh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( floor, TNL::Floor )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( ceil, TNL::Ceil )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sign, TNL::Sign )
+
+#undef TNL_MAKE_STATIC_UNARY_EXPRESSION
+#undef TNL_MAKE_STATIC_BINARY_EXPRESSION
 
 ////
-// Binary expression subtraction
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-operator-( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Subtraction >( a, b );
-}
-
-////
-// Binary expression multiplication
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+// Pow
+template< typename ET1, typename Real,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
-operator*( const ET1& a, const ET2& b )
+pow( const ET1& a, const Real& exp )
 {
-   return StaticBinaryExpressionTemplate< ET1, ET2, Multiplication >( a, b );
+   return StaticBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
 }
 
 ////
-// Binary expression division
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+// Cast
+template< typename ResultType,
+          typename ET1,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
-operator/( const ET1& a, const ET2& b )
+cast( const ET1& a )
 {
-   return StaticBinaryExpressionTemplate< ET1, ET2, Division >( a, b );
+   using CastOperation = typename Cast< ResultType >::Operation;
+   return StaticUnaryExpressionTemplate< ET1, CastOperation >( a );
 }
 
 ////
 // Comparison operator ==
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator==( const ET1& a, const ET2& b )
@@ -315,7 +351,7 @@ operator==( const ET1& a, const ET2& b )
 ////
 // Comparison operator !=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator!=( const ET1& a, const ET2& b )
@@ -326,7 +362,7 @@ operator!=( const ET1& a, const ET2& b )
 ////
 // Comparison operator <
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator<( const ET1& a, const ET2& b )
@@ -337,7 +373,7 @@ operator<( const ET1& a, const ET2& b )
 ////
 // Comparison operator <=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator<=( const ET1& a, const ET2& b )
@@ -348,7 +384,7 @@ operator<=( const ET1& a, const ET2& b )
 ////
 // Comparison operator >
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator>( const ET1& a, const ET2& b )
@@ -359,18 +395,18 @@ operator>( const ET1& a, const ET2& b )
 ////
 // Comparison operator >=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator>=( const ET1& a, const ET2& b )
 {
-   return Expressions::StaticComparison< ET1, ET2 >::GE( a, b );
+   return StaticComparison< ET1, ET2 >::GE( a, b );
 }
 
 ////
 // Scalar product
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 auto
 operator,( const ET1& a, const ET2& b )
@@ -379,7 +415,7 @@ operator,( const ET1& a, const ET2& b )
 }
 
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 auto
 dot( const ET1& a, const ET2& b )
@@ -387,321 +423,10 @@ dot( const ET1& a, const ET2& b )
    return (a, b);
 }
 
-////
-// Unary expression plus
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-operator+( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, UnaryPlus >( a );
-}
-
-////
-// Unary expression minus
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-operator-( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, UnaryMinus >( a );
-}
-
-////
-// Binary expression min
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-min( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Min >( a, b );
-}
-
-////
-// Binary expression max
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-max( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Max >( a, b );
-}
-
-////
-// Abs
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-abs( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Abs >( a );
-}
-
-////
-// Pow
-template< typename ET1, typename Real,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-pow( const ET1& a, const Real& exp )
-{
-   return StaticBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
-}
-
-////
-// Exp
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-exp( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Exp >( a );
-}
-
-////
-// Sqrt
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sqrt( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sqrt >( a );
-}
-
-////
-// Cbrt
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-cbrt( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Cbrt >( a );
-}
-
-////
-// Log
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-log( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Log >( a );
-}
-
-////
-// Log10
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-log10( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Log10 >( a );
-}
-
-////
-// Log2
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-log2( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Log2 >( a );
-}
-
-////
-// Sin
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sin( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sin >( a );
-}
-
-////
-// Cos
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-cos( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Cos >( a );
-}
-
-////
-// Tan
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-tan( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Tan >( a );
-}
-
-////
-// Asin
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-asin( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Asin >( a );
-}
-
-////
-// Acos
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-acos( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Acos >( a );
-}
-
-////
-// Atan
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-atan( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Atan >( a );
-}
-
-////
-// Sinh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sinh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sinh >( a );
-}
-
-////
-// Cosh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-cosh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Cosh >( a );
-}
-
-////
-// Tanh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-tanh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Tanh >( a );
-}
-
-////
-// Asinh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-asinh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Asinh >( a );
-}
-
-////
-// Acosh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-acosh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Acosh >( a );
-}
-
-////
-// Atanh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-atanh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Atanh >( a );
-}
-
-////
-// Floor
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-floor( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Floor >( a );
-}
-
-////
-// Ceil
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-ceil( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Ceil >( a );
-}
-
-////
-// Sign
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sign( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sign >( a );
-}
-
-////
-// Cast
-template< typename ResultType,
-          typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
-          // workaround: templated type alias cannot be declared at block level
-          typename CastOperation = typename Cast< ResultType >::Operation >
-__cuda_callable__
-auto
-cast( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, CastOperation >( a );
-}
-
 ////
 // Vertical operations
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 min( const ET1& a )
@@ -710,7 +435,7 @@ min( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 argMin( const ET1& a )
@@ -719,7 +444,7 @@ argMin( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 max( const ET1& a )
@@ -728,7 +453,7 @@ max( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 argMax( const ET1& a )
@@ -737,7 +462,7 @@ argMax( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 sum( const ET1& a )
@@ -746,7 +471,7 @@ sum( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 maxNorm( const ET1& a )
@@ -755,7 +480,7 @@ maxNorm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 l1Norm( const ET1& a )
@@ -764,7 +489,7 @@ l1Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< (ET1::getSize() > 1), bool > = true >
 __cuda_callable__
 auto
@@ -775,7 +500,7 @@ l2Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< ET1::getSize() == 1, bool > = true >
 __cuda_callable__
 auto
@@ -787,7 +512,7 @@ l2Norm( const ET1& a )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< (ET1::getSize() > 1), bool > = true >
 __cuda_callable__
 auto
@@ -806,7 +531,7 @@ lpNorm( const ET1& a, const Real& p )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< ET1::getSize() == 1, bool > = true >
 __cuda_callable__
 auto
@@ -817,7 +542,7 @@ lpNorm( const ET1& a, const Real& p )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 product( const ET1& a )
@@ -826,7 +551,7 @@ product( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 logicalAnd( const ET1& a )
@@ -835,7 +560,7 @@ logicalAnd( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 logicalOr( const ET1& a )
@@ -844,7 +569,7 @@ logicalOr( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 binaryAnd( const ET1& a )
@@ -853,7 +578,7 @@ binaryAnd( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 binaryOr( const ET1& a )
@@ -861,6 +586,15 @@ binaryOr( const ET1& a )
    return StaticExpressionBinaryOr( a );
 }
 
+template< typename ET1,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
+__cuda_callable__
+auto
+binaryXor( const ET1& a )
+{
+   return StaticExpressionBinaryXor( a );
+}
+
 #endif // DOXYGEN_ONLY
 
 ////
@@ -896,6 +630,7 @@ using Expressions::operator+;
 using Expressions::operator-;
 using Expressions::operator*;
 using Expressions::operator/;
+using Expressions::operator%;
 using Expressions::operator,;
 using Expressions::operator==;
 using Expressions::operator!=;
diff --git a/src/TNL/Containers/Expressions/StaticVerticalOperations.h b/src/TNL/Containers/Expressions/StaticVerticalOperations.h
index b95883003..fac7cf244 100644
--- a/src/TNL/Containers/Expressions/StaticVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/StaticVerticalOperations.h
@@ -145,6 +145,16 @@ auto StaticExpressionBinaryOr( const Expression& expression )
    return aux;
 }
 
+template< typename Expression >
+__cuda_callable__
+auto StaticExpressionBinaryXor( const Expression& expression )
+{
+   auto aux = expression[ 0 ];
+   for( int i = 1; i < expression.getSize(); i++ )
+      aux = aux ^ expression[ i ];
+   return aux;
+}
+
 } // namespace Expressions
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/TypeTraits.h b/src/TNL/Containers/Expressions/TypeTraits.h
index 943d61470..9af3ef818 100644
--- a/src/TNL/Containers/Expressions/TypeTraits.h
+++ b/src/TNL/Containers/Expressions/TypeTraits.h
@@ -31,11 +31,12 @@ struct HasEnabledDistributedExpressionTemplates : std::false_type
 
 
 // type aliases for enabling specific operators and functions using SFINAE
-template< typename ET1 >
+template< typename ET1, typename T = void >
 using EnableIfStaticUnaryExpression_t = std::enable_if_t<
-      HasEnabledStaticExpressionTemplates< std::decay_t< ET1 > >::value >;
+         HasEnabledStaticExpressionTemplates< std::decay_t< ET1 > >::value,
+      T >;
 
-template< typename ET1, typename ET2 >
+template< typename ET1, typename ET2, typename T = void >
 using EnableIfStaticBinaryExpression_t = std::enable_if_t<
       (
          HasEnabledStaticExpressionTemplates< std::decay_t< ET1 > >::value ||
@@ -46,13 +47,15 @@ using EnableIfStaticBinaryExpression_t = std::enable_if_t<
          HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value ||
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET2 > >::value ||
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value
-      ) >;
+      ),
+      T >;
 
-template< typename ET1 >
+template< typename ET1, typename T = void >
 using EnableIfUnaryExpression_t = std::enable_if_t<
-      HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value >;
+         HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value,
+      T >;
 
-template< typename ET1, typename ET2 >
+template< typename ET1, typename ET2, typename T = void >
 using EnableIfBinaryExpression_t = std::enable_if_t<
       // we need to avoid ambiguity with operators defined in Array (e.g. Array::operator==)
       // so the first operand must not be Array
@@ -64,13 +67,15 @@ using EnableIfBinaryExpression_t = std::enable_if_t<
       (
          HasEnabledExpressionTemplates< std::decay_t< ET2 > >::value ||
          HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value
-      ) >;
+      ),
+      T >;
 
-template< typename ET1 >
+template< typename ET1, typename T = void >
 using EnableIfDistributedUnaryExpression_t = std::enable_if_t<
-      HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value >;
+         HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value,
+      T >;
 
-template< typename ET1, typename ET2 >
+template< typename ET1, typename ET2, typename T = void >
 using EnableIfDistributedBinaryExpression_t = std::enable_if_t<
       // we need to avoid ambiguity with operators defined in Array (e.g. Array::operator==)
       // so the first operand must not be Array
@@ -82,7 +87,8 @@ using EnableIfDistributedBinaryExpression_t = std::enable_if_t<
       (
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET2 > >::value ||
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value
-      ) >;
+      ),
+      T >;
 
 
 // helper trait class for recursively turning expression template classes into compatible vectors
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index 793609c53..e39276484 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -18,28 +18,160 @@
 namespace TNL {
 
 /**
- * \brief Extension of \ref std::plus<void> for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `x + y`.
  */
 struct Plus : public std::plus< void >
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
    static constexpr T getIdentity() { return 0; }
 };
 
 /**
- * \brief Extension of \ref std::multiplies<void> for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `x - y`.
+ */
+using Minus = std::minus< void >;
+
+/**
+ * \brief Function object implementing `x * y`.
  */
 struct Multiplies : public std::multiplies< void >
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
    static constexpr T getIdentity() { return 1; }
 };
 
 /**
- * \brief Function object implementing `min(x, y)` for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `x / y`.
+ */
+using Divides = std::divides< void >;
+
+/**
+ * \brief Function object implementing `x % y`.
+ */
+using Modulus = std::modulus< void >;
+
+/**
+ * \brief Function object implementing `+x`.
+ */
+struct UnaryPlus
+{
+   template< typename T >
+   constexpr auto operator()( const T& x ) const -> decltype( +x )
+   {
+      return +x;
+   }
+};
+
+/**
+ * \brief Function object implementing `-x`.
+ */
+using UnaryMinus = std::negate< void >;
+
+/**
+ * \brief Function object implementing `x && y`.
+ */
+struct LogicalAnd : public std::logical_and< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::max();
+   }
+};
+
+/**
+ * \brief Function object implementing `x || y`.
+ */
+struct LogicalOr : public std::logical_or< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return 0; }
+};
+
+/**
+ * \brief Function object implementing `!x`.
+ */
+using LogicalNot = std::logical_not< void >;
+
+/**
+ * \brief Function object implementing `x & y`.
+ */
+struct BitAnd : public std::bit_and< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return ~static_cast< T >( 0 ); }
+};
+
+/**
+ * \brief Function object implementing `x | y`.
+ */
+struct BitOr : public std::bit_or< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return 0; }
+};
+
+/**
+ * \brief Function object implementing `x ^ y`.
+ */
+struct BitXor : public std::bit_xor< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return 0; }
+};
+
+/**
+ * \brief Function object implementing `~x`.
+ */
+using BitNot = std::bit_not< void >;
+
+/**
+ * \brief Function object implementing `min(x, y)`.
  */
 struct Min
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
    static constexpr T getIdentity()
    {
@@ -48,8 +180,8 @@ struct Min
       return std::numeric_limits< T >::max();
    }
 
-   template< typename Value >
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const
+   template< typename T1, typename T2 >
+   constexpr auto operator()( const T1& lhs, const T2& rhs ) const
    {
       // use argument-dependent lookup and make TNL::min available for unqualified calls
       using TNL::min;
@@ -58,10 +190,15 @@ struct Min
 };
 
 /**
- * \brief Function object implementing `max(x, y)` for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `max(x, y)`.
  */
 struct Max
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
    static constexpr T getIdentity()
    {
@@ -70,8 +207,8 @@ struct Max
       return std::numeric_limits< T >::lowest();
    }
 
-   template< typename Value >
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const
+   template< typename T1, typename T2 >
+   constexpr auto operator()( const T1& lhs, const T2& rhs ) const
    {
       // use argument-dependent lookup and make TNL::max available for unqualified calls
       using TNL::max;
@@ -84,6 +221,11 @@ struct Max
  */
 struct MinWithArg
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
    static constexpr T getIdentity()
    {
@@ -112,6 +254,11 @@ struct MinWithArg
  */
 struct MaxWithArg
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
    static constexpr T getIdentity()
    {
@@ -135,45 +282,68 @@ struct MaxWithArg
    }
 };
 
-/**
- * \brief Extension of \ref std::logical_and<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct LogicalAnd : public std::logical_and< void >
-{
-   template< typename T >
-   static constexpr T getIdentity()
-   {
-      static_assert( std::numeric_limits< T >::is_specialized,
-                     "std::numeric_limits is not specialized for the requested type" );
-      return std::numeric_limits< T >::max();
-   }
-};
+#define TNL_MAKE_UNARY_FUNCTIONAL(name, function)                       \
+   struct name                                                          \
+   {                                                                    \
+      template< typename T >                                            \
+      __cuda_callable__                                                 \
+      auto operator()( const T& x ) const -> decltype( function( x ) )  \
+      {                                                                 \
+         return function( x );                                          \
+      }                                                                 \
+   };                                                                   \
 
-/**
- * \brief Extension of \ref std::logical_or<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct LogicalOr : public std::logical_or< void >
-{
-   template< typename T >
-   static constexpr T getIdentity() { return 0; }
-};
+#define TNL_MAKE_BINARY_FUNCTIONAL(name, function)                                       \
+   struct name                                                                           \
+   {                                                                                     \
+      template< typename T1, typename T2 >                                               \
+      __cuda_callable__                                                                  \
+      auto operator()( const T1& x, const T2& y ) const -> decltype( function( x, y ) )  \
+      {                                                                                  \
+         return function( x, y );                                                        \
+      }                                                                                  \
+   };                                                                                    \
 
-/**
- * \brief Extension of \ref std::bit_and<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct BitAnd : public std::bit_and< void >
-{
-   template< typename T >
-   static constexpr T getIdentity() { return ~static_cast< T >( 0 ); }
-};
+TNL_MAKE_UNARY_FUNCTIONAL( Abs, abs )
+TNL_MAKE_UNARY_FUNCTIONAL( Exp, exp )
+TNL_MAKE_UNARY_FUNCTIONAL( Sqrt, sqrt )
+TNL_MAKE_UNARY_FUNCTIONAL( Cbrt, cbrt )
+TNL_MAKE_UNARY_FUNCTIONAL( Log, log )
+TNL_MAKE_UNARY_FUNCTIONAL( Log10, log10 )
+TNL_MAKE_UNARY_FUNCTIONAL( Log2, log2 )
+TNL_MAKE_UNARY_FUNCTIONAL( Sin, sin )
+TNL_MAKE_UNARY_FUNCTIONAL( Cos, cos )
+TNL_MAKE_UNARY_FUNCTIONAL( Tan, tan )
+TNL_MAKE_UNARY_FUNCTIONAL( Asin, asin )
+TNL_MAKE_UNARY_FUNCTIONAL( Acos, acos )
+TNL_MAKE_UNARY_FUNCTIONAL( Atan, atan )
+TNL_MAKE_UNARY_FUNCTIONAL( Sinh, sinh )
+TNL_MAKE_UNARY_FUNCTIONAL( Cosh, cosh )
+TNL_MAKE_UNARY_FUNCTIONAL( Tanh, tanh )
+TNL_MAKE_UNARY_FUNCTIONAL( Asinh, asinh )
+TNL_MAKE_UNARY_FUNCTIONAL( Acosh, acosh )
+TNL_MAKE_UNARY_FUNCTIONAL( Atanh, atanh )
+TNL_MAKE_UNARY_FUNCTIONAL( Floor, floor )
+TNL_MAKE_UNARY_FUNCTIONAL( Ceil, ceil )
+TNL_MAKE_UNARY_FUNCTIONAL( Sign, sign )
 
-/**
- * \brief Extension of \ref std::bit_or<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct BitOr : public std::bit_or< void >
+TNL_MAKE_BINARY_FUNCTIONAL( Pow, pow )
+
+#undef TNL_MAKE_UNARY_FUNCTIONAL
+#undef TNL_MAKE_BINARY_FUNCTIONAL
+
+template< typename ResultType >
+struct Cast
 {
-   template< typename T >
-   static constexpr T getIdentity() { return 0; }
+   struct Operation
+   {
+      template< typename T >
+      __cuda_callable__
+      auto operator()( const T& a ) const -> ResultType
+      {
+         return static_cast<ResultType>( a );
+      }
+   };
 };
 
 } // namespace TNL
-- 
GitLab


From d94899f0f6959be89d0eada1449129a461fbdd13 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 8 Aug 2021 11:58:36 +0200
Subject: [PATCH 50/52] Simplified cast operator in StaticArray and
 StaticVector

---
 src/TNL/Containers/StaticArray.hpp  | 6 +-----
 src/TNL/Containers/StaticVector.hpp | 6 +-----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/TNL/Containers/StaticArray.hpp b/src/TNL/Containers/StaticArray.hpp
index d356c9c3a..507a84a63 100644
--- a/src/TNL/Containers/StaticArray.hpp
+++ b/src/TNL/Containers/StaticArray.hpp
@@ -295,11 +295,7 @@ StaticArray< Size, Value >::
 operator StaticArray< Size, OtherValue >() const
 {
    StaticArray< Size, OtherValue > aux;
-   Algorithms::unrolledFor< int, 0, Size >(
-      [&] ( int i ) mutable {
-         aux[ i ] = (*this)[ i ];
-      }
-   );
+   aux.operator=( *this );
    return aux;
 }
 
diff --git a/src/TNL/Containers/StaticVector.hpp b/src/TNL/Containers/StaticVector.hpp
index bb22eba8c..3092642e5 100644
--- a/src/TNL/Containers/StaticVector.hpp
+++ b/src/TNL/Containers/StaticVector.hpp
@@ -99,11 +99,7 @@ StaticVector< Size, Real >::
 operator StaticVector< Size, OtherReal >() const
 {
    StaticVector< Size, OtherReal > aux;
-   Algorithms::unrolledFor< int, 0, Size >(
-      [&] ( int i ) mutable {
-         aux[ i ] = (*this)[ i ];
-      }
-   );
+   aux.operator=( *this );
    return aux;
 }
 
-- 
GitLab


From cfe19eb86c063ab1f381c4a6b4c48aa75f104644 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 8 Aug 2021 12:09:13 +0200
Subject: [PATCH 51/52] Changed runtime exceptions in the comparison of vector
 expressions to static asserts

---
 src/TNL/Containers/Expressions/Comparison.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h
index 144750eb5..79d1a61d2 100644
--- a/src/TNL/Containers/Expressions/Comparison.h
+++ b/src/TNL/Containers/Expressions/Comparison.h
@@ -56,8 +56,8 @@ struct VectorComparison< T1, T2, false >
 {
    static bool EQ( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
 
       if( a.getSize() != b.getSize() )
          return false;
@@ -90,8 +90,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool GT( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
@@ -105,8 +105,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool GE( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
@@ -120,8 +120,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool LT( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
@@ -135,8 +135,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool LE( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
-- 
GitLab


From d9af4a614564ff2b7695b01a52bb46b88c09d6b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Sun, 8 Aug 2021 12:01:25 +0200
Subject: [PATCH 52/52] Added modulo assignment operator to StaticVector,
 Vector, DistributedVector and their views

---
 src/TNL/Containers/DistributedVector.h       | 10 ++++
 src/TNL/Containers/DistributedVector.hpp     | 26 ++++++++++
 src/TNL/Containers/DistributedVectorView.h   | 10 ++++
 src/TNL/Containers/DistributedVectorView.hpp | 41 +++++++++++++++
 src/TNL/Containers/StaticVector.h            | 42 ++++++++++------
 src/TNL/Containers/StaticVector.hpp          |  9 ++++
 src/TNL/Containers/Vector.h                  | 12 +++++
 src/TNL/Containers/Vector.hpp                | 14 +++++-
 src/TNL/Containers/VectorView.h              | 13 +++++
 src/TNL/Containers/VectorView.hpp            | 13 ++++-
 src/TNL/Containers/detail/VectorAssignment.h | 53 +++++++++++++++++---
 11 files changed, 220 insertions(+), 23 deletions(-)

diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h
index beb9840b4..f3b53f56f 100644
--- a/src/TNL/Containers/DistributedVector.h
+++ b/src/TNL/Containers/DistributedVector.h
@@ -150,6 +150,11 @@ public:
              typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
    DistributedVector& operator/=( Scalar c );
 
+   template< typename Scalar,
+             typename...,
+             typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
+   DistributedVector& operator%=( Scalar c );
+
    template< typename Vector,
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
@@ -174,6 +179,11 @@ public:
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
    DistributedVector& operator/=( const Vector& vector );
+
+   template< typename Vector,
+             typename...,
+             typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
+   DistributedVector& operator%=( const Vector& vector );
 };
 
 // Enable expression templates for DistributedVector
diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp
index 72be20d0a..2af5eab2c 100644
--- a/src/TNL/Containers/DistributedVector.hpp
+++ b/src/TNL/Containers/DistributedVector.hpp
@@ -184,6 +184,19 @@ operator/=( const Vector& vector )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename Vector, typename..., typename >
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
+operator%=( const Vector& vector )
+{
+   getView() %= vector;
+   return *this;
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -249,5 +262,18 @@ operator/=( Scalar c )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename Scalar, typename..., typename >
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
+operator%=( Scalar c )
+{
+   getView() %= c;
+   return *this;
+}
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h
index 4ceef1e7d..ee9bb287f 100644
--- a/src/TNL/Containers/DistributedVectorView.h
+++ b/src/TNL/Containers/DistributedVectorView.h
@@ -121,6 +121,11 @@ public:
              typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
    DistributedVectorView& operator/=( Scalar c );
 
+   template< typename Scalar,
+             typename...,
+             typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
+   DistributedVectorView& operator%=( Scalar c );
+
    template< typename Vector,
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
@@ -145,6 +150,11 @@ public:
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
    DistributedVectorView& operator/=( const Vector& vector );
+
+   template< typename Vector,
+             typename...,
+             typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
+   DistributedVectorView& operator%=( const Vector& vector );
 };
 
 // Enable expression templates for DistributedVector
diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp
index 181270a35..69ad4c74b 100644
--- a/src/TNL/Containers/DistributedVectorView.hpp
+++ b/src/TNL/Containers/DistributedVectorView.hpp
@@ -212,6 +212,32 @@ operator/=( const Vector& vector )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Vector, typename..., typename >
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
+operator%=( const Vector& vector )
+{
+   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
+                  "Vector sizes must be equal." );
+   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
+                  "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
+                  "Multiary operations are supported only on vectors within the same communication group." );
+
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() %= vector.getConstLocalViewWithGhosts();
+   }
+   return *this;
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -287,5 +313,20 @@ operator/=( Scalar c )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Scalar, typename..., typename >
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
+operator%=( Scalar c )
+{
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      getLocalView() %= c;
+      this->startSynchronization();
+   }
+   return *this;
+}
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/StaticVector.h b/src/TNL/Containers/StaticVector.h
index 13cdd0fbc..da310a1ca 100644
--- a/src/TNL/Containers/StaticVector.h
+++ b/src/TNL/Containers/StaticVector.h
@@ -70,7 +70,7 @@ public:
 
    /**
     * \brief Constructor from binary vector expression.
-    * 
+    *
     * \param expr is binary expression.
     */
    template< typename T1,
@@ -81,7 +81,7 @@ public:
 
    /**
     * \brief Constructor from unary expression.
-    * 
+    *
     * \param expr is unary expression
     */
    template< typename T,
@@ -100,9 +100,9 @@ public:
 
    /**
     * \brief Assignment operator with a vector expression.
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -112,9 +112,9 @@ public:
 
    /**
     * \brief Addition operator with a vector expression
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -124,9 +124,9 @@ public:
 
    /**
     * \brief Subtraction operator with a vector expression.
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -138,7 +138,7 @@ public:
     * \brief Elementwise multiplication by a vector expression.
     *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression.
     * \return reference to this vector
     */
@@ -148,9 +148,9 @@ public:
 
    /**
     * \brief Elementwise division by a vector expression.
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -158,15 +158,27 @@ public:
    __cuda_callable__
    StaticVector& operator/=( const VectorExpression& expression );
 
+   /**
+    * \brief Elementwise modulo by a vector expression.
+    *
+    * The vector expression can be even just static vector.
+    *
+    * \param expression is the vector expression
+    * \return reference to this vector
+    */
+   template< typename VectorExpression >
+   __cuda_callable__
+   StaticVector& operator%=( const VectorExpression& expression );
+
    /**
     * \brief Cast operator for changing of the \e Value type.
-    * 
+    *
     * Returns static array having \e ValueType set to \e OtherValue, i.e.
     * StaticArray< Size, OtherValue >.
-    * 
-    * \tparam OtherValue is the \e Value type of the static array the casting 
+    *
+    * \tparam OtherValue is the \e Value type of the static array the casting
     * will be performed to.
-    * 
+    *
     * \return instance of StaticVector< Size, OtherValue >
     */
    template< typename OtherReal >
diff --git a/src/TNL/Containers/StaticVector.hpp b/src/TNL/Containers/StaticVector.hpp
index 3092642e5..b995dc11a 100644
--- a/src/TNL/Containers/StaticVector.hpp
+++ b/src/TNL/Containers/StaticVector.hpp
@@ -92,6 +92,15 @@ StaticVector< Size, Real >& StaticVector< Size, Real >::operator/=( const Vector
    return *this;
 }
 
+template< int Size, typename Real >
+   template< typename VectorExpression >
+__cuda_callable__
+StaticVector< Size, Real >& StaticVector< Size, Real >::operator%=( const VectorExpression& expression )
+{
+   detail::VectorAssignmentWithOperation< StaticVector, VectorExpression >::moduloStatic( *this, expression );
+   return *this;
+}
+
 template< int Size, typename Real >
    template< typename OtherReal >
 __cuda_callable__
diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h
index 0bb991a67..b6708563a 100644
--- a/src/TNL/Containers/Vector.h
+++ b/src/TNL/Containers/Vector.h
@@ -256,6 +256,18 @@ public:
     */
    template< typename VectorExpression >
    Vector& operator/=( const VectorExpression& expression );
+
+   /**
+    * \brief Modulo assignment operator for vector and a vector expression.
+    *
+    * The modulo operation is evaluated element-wise. The vector expression must
+    * either evaluate to a scalar or a vector of the same size as this vector.
+    *
+    * \param expression Reference to a vector expression.
+    * \return Reference to this vector.
+    */
+   template< typename VectorExpression >
+   Vector& operator%=( const VectorExpression& expression );
 };
 
 // Enable expression templates for Vector
diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp
index f204368de..6ab91cd3e 100644
--- a/src/TNL/Containers/Vector.hpp
+++ b/src/TNL/Containers/Vector.hpp
@@ -11,7 +11,6 @@
 #pragma once
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
 namespace Containers {
@@ -153,5 +152,18 @@ operator/=( const VectorExpression& expression )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename VectorExpression >
+Vector< Real, Device, Index, Allocator >&
+Vector< Real, Device, Index, Allocator >::
+operator%=( const VectorExpression& expression )
+{
+   detail::VectorAssignmentWithOperation< Vector, VectorExpression >::modulo( *this, expression );
+   return *this;
+}
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h
index 90f98028f..04d23b065 100644
--- a/src/TNL/Containers/VectorView.h
+++ b/src/TNL/Containers/VectorView.h
@@ -213,6 +213,19 @@ public:
     */
    template< typename VectorExpression >
    VectorView& operator/=( const VectorExpression& expression );
+
+   /**
+    * \brief Modulo assignment operator for vector view and a vector expression.
+    *
+    * The modulo operation is evaluated element-wise. The vector expression must
+    * either evaluate to a scalar or a vector of the same size as this vector
+    * view.
+    *
+    * \param expression Reference to a vector expression.
+    * \return Reference to this vector.
+    */
+   template< typename VectorExpression >
+   VectorView& operator%=( const VectorExpression& expression );
 };
 
 // Enable expression templates for VectorView
diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp
index 034f362dc..0d7d13b65 100644
--- a/src/TNL/Containers/VectorView.hpp
+++ b/src/TNL/Containers/VectorView.hpp
@@ -12,7 +12,6 @@
 
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Containers/detail/VectorAssignment.h>
-#include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
 namespace Containers {
@@ -102,5 +101,17 @@ operator/=( const VectorExpression& expression )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename VectorExpression >
+VectorView< Real, Device, Index >&
+VectorView< Real, Device, Index >::
+operator%=( const VectorExpression& expression )
+{
+   detail::VectorAssignmentWithOperation< VectorView, VectorExpression >::modulo( *this, expression );
+   return *this;
+}
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/detail/VectorAssignment.h b/src/TNL/Containers/detail/VectorAssignment.h
index 5a36d971c..5c3815a9c 100644
--- a/src/TNL/Containers/detail/VectorAssignment.h
+++ b/src/TNL/Containers/detail/VectorAssignment.h
@@ -27,7 +27,7 @@ template< typename Vector,
 struct VectorAssignment;
 
 /**
- * \brief Vector assignment with an operation: +=, -=, *=, /=
+ * \brief Vector assignment with an operation: +=, -=, *=, /=, %=
  */
 template< typename Vector,
           typename T,
@@ -87,7 +87,6 @@ struct VectorAssignment< Vector, T, false >
    __cuda_callable__
    static void assignStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] = t;
    }
@@ -246,6 +245,31 @@ struct VectorAssignmentWithOperation< Vector, T, true, false >
       };
       Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), divide );
    }
+
+   __cuda_callable__
+   static void moduloStatic( Vector& v, const T& t )
+   {
+      TNL_ASSERT_EQ( v.getSize(), t.getSize(), "The sizes of the vectors must be equal." );
+      for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
+         v[ i ] %= t[ i ];
+   }
+
+   static void modulo( Vector& v, const T& t )  // v[ i ] %= t[ i ] for every i in [0, v.getSize())
+   {
+      static_assert( std::is_same< typename Vector::DeviceType, typename T::DeviceType >::value,
+                     "Cannot assign an expression to a vector allocated on a different device." );
+      TNL_ASSERT_EQ( v.getSize(), t.getSize(), "The sizes of the vectors must be equal." );
+      using RealType = typename Vector::RealType;
+      using DeviceType = typename Vector::DeviceType;
+      using IndexType = typename Vector::IndexType;
+
+      RealType* data = v.getData();  // raw pointer, captured by value in the lambda below
+      auto applyModulo = [=] __cuda_callable__ ( IndexType i )
+      {
+         data[ i ] %= t[ i ];  // element-wise remainder with the i-th element of the expression
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), applyModulo );
+   }
 };
 
 /**
@@ -259,7 +283,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void additionStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] += t;
    }
@@ -281,7 +304,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void subtractionStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] -= t;
    }
@@ -303,7 +325,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void multiplicationStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] *= t;
    }
@@ -325,7 +346,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void divisionStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] /= t;
    }
@@ -343,6 +363,27 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
       };
       Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), divide );
    }
+
+   __cuda_callable__
+   static void moduloStatic( Vector& v, const T& t )
+   {
+      for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
+         v[ i ] %= t;
+   }
+
+   static void modulo( Vector& v, const T& t )  // v[ i ] %= t for every i in [0, v.getSize())
+   {
+      using RealType = typename Vector::RealType;
+      using DeviceType = typename Vector::DeviceType;
+      using IndexType = typename Vector::IndexType;
+
+      RealType* data = v.getData();  // raw pointer, captured by value in the lambda below
+      auto applyModulo = [=] __cuda_callable__ ( IndexType i )
+      {
+         data[ i ] %= t;  // same right-hand side t is applied to every element
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), applyModulo );
+   }
 };
 
 } // namespace detail
-- 
GitLab