diff --git a/Documentation/Examples/Algorithms/CMakeLists.txt b/Documentation/Examples/Algorithms/CMakeLists.txt
index 5ffb91b16855d23531ad5c06dba8a66cdefd190c..982b9c06f4901ec57240ca5affe6c5cdc81512e4 100644
--- a/Documentation/Examples/Algorithms/CMakeLists.txt
+++ b/Documentation/Examples/Algorithms/CMakeLists.txt
@@ -1,21 +1,39 @@
 IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
    CUDA_ADD_EXECUTABLE( SortingExampleCuda SortingExample.cu)
    ADD_CUSTOM_COMMAND( COMMAND SortingExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
+
    CUDA_ADD_EXECUTABLE( SortingExample2Cuda SortingExample2.cu)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample2Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
+
    CUDA_ADD_EXECUTABLE( SortingExample3Cuda SortingExample3.cu)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample3Cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
+
+   CUDA_ADD_EXECUTABLE(ParallelForExampleCuda ParallelForExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+
+   CUDA_ADD_EXECUTABLE(reduceArrayExampleCuda reduceArrayExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
+
+   CUDA_ADD_EXECUTABLE(reduceWithArgumentArrayExampleCuda reduceWithArgumentArrayExample.cu)
+   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExampleCuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
 ELSE()
-   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
-   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
    ADD_EXECUTABLE( SortingExample SortingExample.cpp)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample.out OUTPUT SortingExample.out )
+
    ADD_EXECUTABLE( SortingExample2 SortingExample2.cpp)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample2.out OUTPUT SortingExample2.out )
+
    ADD_EXECUTABLE( SortingExample3 SortingExample3.cpp)
    ADD_CUSTOM_COMMAND( COMMAND SortingExample3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SortingExample3.out OUTPUT SortingExample3.out )
+
+   ADD_EXECUTABLE(ParallelForExample ParallelForExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND ParallelForExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ParallelForExample.out OUTPUT ParallelForExample.out )
+
+   ADD_EXECUTABLE(reduceArrayExample reduceArrayExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND reduceArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceArrayExample.out OUTPUT reduceArrayExample.out )
+
+   ADD_EXECUTABLE(reduceWithArgumentArrayExample reduceWithArgumentArrayExample.cpp)
+   ADD_CUSTOM_COMMAND( COMMAND reduceWithArgumentArrayExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/reduceWithArgumentArrayExample.out OUTPUT reduceWithArgumentArrayExample.out )
 ENDIF()
 
 ADD_EXECUTABLE(staticForExample staticForExample.cpp)
@@ -29,6 +47,8 @@ ADD_CUSTOM_TARGET( RunAlgorithmsExamples ALL DEPENDS
    SortingExample2.out
    SortingExample3.out
    ParallelForExample.out
+   reduceArrayExample.out
+   reduceWithArgumentArrayExample.out
    unrolledForExample.out
    staticForExample.out
 )
diff --git a/Documentation/Examples/Containers/ArrayExample_reduceElements.cpp b/Documentation/Examples/Algorithms/reduceArrayExample.cpp
similarity index 58%
rename from Documentation/Examples/Containers/ArrayExample_reduceElements.cpp
rename to Documentation/Examples/Algorithms/reduceArrayExample.cpp
index bdf9437321ce7b7f2ba5fae698259fa4e2de4c2e..5af0a243670df8f7ec2e89f302acaa947b31d7a5 100644
--- a/Documentation/Examples/Containers/ArrayExample_reduceElements.cpp
+++ b/Documentation/Examples/Algorithms/reduceArrayExample.cpp
@@ -1,12 +1,10 @@
-#include <iostream>
-#include <functional>
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/ArrayView.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 
 template< typename Device >
-void reduceElementsExample()
+void reduceArrayExample()
 {
    /****
     * Create new arrays
@@ -22,23 +20,28 @@ void reduceElementsExample()
    /****
     * Sum all elements of array `a`
     */
-   auto fetch = [=] __cuda_callable__ ( int i, float& value ) { return value; };
-   auto sum = a.reduceEachElement( fetch, std::plus<>{}, 0.0 );
+   float sum_total = Algorithms::reduce( a, TNL::Plus{} );
+
+   /****
+    * Sum last 5 elements of array `a`
+    */
+   float sum_last_five = Algorithms::reduce( a.getConstView( 5, 10 ), TNL::Plus{} );
 
    /****
     * Print the results
     */
    std::cout << " a = " << a << std::endl;
-   std::cout << " sum = " << sum << std::endl;
+   std::cout << " sum of all elements = " << sum_total << std::endl;
+   std::cout << " sum of last 5 elements = " << sum_last_five << std::endl;
 }
 
 int main( int argc, char* argv[] )
 {
    std::cout << "Running example on the host system: " << std::endl;
-   reduceElementsExample< Devices::Host >();
+   reduceArrayExample< Devices::Host >();
 
 #ifdef HAVE_CUDA
    std::cout << "Running example on the CUDA device: " << std::endl;
-   reduceElementsExample< Devices::Cuda >();
+   reduceArrayExample< Devices::Cuda >();
 #endif
 }
diff --git a/Documentation/Examples/Algorithms/reduceArrayExample.cu b/Documentation/Examples/Algorithms/reduceArrayExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..87a4a231064e78c70ec8eed00e89ac46a11fbc8a
--- /dev/null
+++ b/Documentation/Examples/Algorithms/reduceArrayExample.cu
@@ -0,0 +1 @@
+reduceArrayExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cpp b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c29764ad38da7fd06dfeabcd8a549bc9bcd37a83
--- /dev/null
+++ b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cpp
@@ -0,0 +1,41 @@
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/reduce.h>
+
+using namespace TNL;
+
+template< typename Device >
+void reduceWithArgumentArrayExample()
+{
+   /****
+    * Create new arrays
+    */
+   const int size = 10;
+   Containers::Vector< float, Device > a( size );
+
+   /****
+    * Initiate the elements of array `a`
+    */
+   a.forAllElements( [] __cuda_callable__ ( int i, float& value ) { value = 3 - i; } );
+
+   /****
+    * Reduce all elements of array `a`
+    */
+   std::pair< float, int > result_total = Algorithms::reduceWithArgument( TNL::abs( a ), TNL::MaxWithArg{} );
+
+   /****
+    * Print the results
+    */
+   std::cout << " a = " << a << std::endl;
+   std::cout << " abs-max of all elements = " << result_total.first << " at position " << result_total.second << std::endl;
+}
+
+int main( int argc, char* argv[] )
+{
+   std::cout << "Running example on the host system: " << std::endl;
+   reduceWithArgumentArrayExample< Devices::Host >();
+
+#ifdef HAVE_CUDA
+   std::cout << "Running example on the CUDA device: " << std::endl;
+   reduceWithArgumentArrayExample< Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cu b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..d5721a03ad9aaa5866bcd8eb45ced9d62576208f
--- /dev/null
+++ b/Documentation/Examples/Algorithms/reduceWithArgumentArrayExample.cu
@@ -0,0 +1 @@
+reduceWithArgumentArrayExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Containers/ArrayExample.cpp b/Documentation/Examples/Containers/ArrayExample.cpp
index 4d6ca48e5865c340f26660a9c6305e627c880ec7..71945e1df5dae4b5e447914e7ffc3c576c32d3ca 100644
--- a/Documentation/Examples/Containers/ArrayExample.cpp
+++ b/Documentation/Examples/Containers/ArrayExample.cpp
@@ -40,16 +40,6 @@ void arrayExample()
    a1 = v;
    std::cout << "a1 = " << a1 << std::endl;
 
-   /***
-    * Simple array values checks can be done as follows ...
-    */
-   if( a1.containsValue( 1 ) )
-      std::cout << "a1 contains value 1." << std::endl;
-   if( a1.containsValue( size ) )
-      std::cout << "a1 contains value " << size << "." << std::endl;
-   if( a1.containsOnlyValue( 0 ) )
-      std::cout << "a2 contains only value 0." << std::endl;
-
    /***
     * You may swap array data with the swap method.
     */
diff --git a/Documentation/Examples/Containers/ArrayExample_reduceElements.cu b/Documentation/Examples/Containers/ArrayExample_reduceElements.cu
deleted file mode 120000
index 466460f2f8be4e00abbcbd949f88ed7740225288..0000000000000000000000000000000000000000
--- a/Documentation/Examples/Containers/ArrayExample_reduceElements.cu
+++ /dev/null
@@ -1 +0,0 @@
-ArrayExample_reduceElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Containers/ArrayViewExample.cpp b/Documentation/Examples/Containers/ArrayViewExample.cpp
index fdc1897c53bfc11075878672cf7167705b71b255..23207e69f49df232a9cb3ae32b0bd91af2f4bbbd 100644
--- a/Documentation/Examples/Containers/ArrayViewExample.cpp
+++ b/Documentation/Examples/Containers/ArrayViewExample.cpp
@@ -29,16 +29,6 @@ void arrayViewExample()
     */
    a2_view = 0;
 
-   /***
-    * Simple array view values checks can be done as follows ...
-    */
-   if( a1_view.containsValue( 1 ) )
-      std::cout << "a1 contains value 1." << std::endl;
-   if( a1_view.containsValue( size ) )
-      std::cout << "a1 contains value " << size << "." << std::endl;
-   if( a1_view.containsOnlyValue( 0 ) )
-      std::cout << "a2 contains only value 0." << std::endl;
-
    /***
     * More efficient way of array view elements manipulation is with the lambda functions
     */
diff --git a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cpp b/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cpp
deleted file mode 100644
index 1357ac8d0acd0bb8df72b077876c214d4d749524..0000000000000000000000000000000000000000
--- a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-#include <iostream>
-#include <functional>
-#include <TNL/Containers/Array.h>
-#include <TNL/Containers/ArrayView.h>
-
-using namespace TNL;
-
-template< typename Device >
-void reduceElementsExample()
-{
-   /****
-    * Create new arrays
-    */
-   const int size = 10;
-   Containers::Array< float, Device > a( size );
-   auto a_view = a.getView();
-
-   /****
-    * Initiate the elements of array `a`
-    */
-   a_view.forAllElements( [] __cuda_callable__ ( int i, float& value ) { value = i; } );
-
-   /****
-    * Sum all elements of array `a`
-    */
-   auto fetch = [=] __cuda_callable__ ( int i, float& value ) { return value; };
-   auto sum = a_view.reduceEachElement( fetch, std::plus<>{}, 0.0 );
-
-   /****
-    * Print the results
-    */
-   std::cout << " a = " << a << std::endl;
-   std::cout << " sum = " << sum << std::endl;
-}
-
-int main( int argc, char* argv[] )
-{
-   std::cout << "Running example on the host system: " << std::endl;
-   reduceElementsExample< Devices::Host >();
-
-#ifdef HAVE_CUDA
-   std::cout << "Running example on the CUDA device: " << std::endl;
-   reduceElementsExample< Devices::Cuda >();
-#endif
-}
diff --git a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cu b/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cu
deleted file mode 120000
index 220efb6f8db654504aecf69ac1397c6d662c7d92..0000000000000000000000000000000000000000
--- a/Documentation/Examples/Containers/ArrayViewExample_reduceElements.cu
+++ /dev/null
@@ -1 +0,0 @@
-ArrayViewExample_reduceElements.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Containers/CMakeLists.txt b/Documentation/Examples/Containers/CMakeLists.txt
index e85546a4516a1d99f9bde21a7c9fa98a554c00d9..eb9f8d30a85b8b6dce8500f10cb928048c107d01 100644
--- a/Documentation/Examples/Containers/CMakeLists.txt
+++ b/Documentation/Examples/Containers/CMakeLists.txt
@@ -1,10 +1,8 @@
 set( COMMON_EXAMPLES
          ArrayExample
          ArrayExample_forElements
-         ArrayExample_reduceElements
          ArrayViewExample
          ArrayViewExample_forElements
-         ArrayViewExample_reduceElements
          VectorExample
 )
 
diff --git a/Documentation/Examples/Containers/DistributedArrayExample.cu b/Documentation/Examples/Containers/DistributedArrayExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..e4e614621043194ece749200d90442eee70c8a1d
--- /dev/null
+++ b/Documentation/Examples/Containers/DistributedArrayExample.cu
@@ -0,0 +1 @@
+DistributedArrayExample.cpp
\ No newline at end of file
diff --git a/Documentation/Examples/Containers/VectorExample.cpp b/Documentation/Examples/Containers/VectorExample.cpp
index a3fdf99d9fe5ddd628ba8656a619651c750e773f..4f126dfb299f2c90efec05250641fad5f1fb472f 100644
--- a/Documentation/Examples/Containers/VectorExample.cpp
+++ b/Documentation/Examples/Containers/VectorExample.cpp
@@ -13,8 +13,6 @@ void VectorExample()
 {
     Containers::Vector< int, Device > vector1( 5 );
     vector1 = 0;
-    cout << "Does vector contain 1?" << vector1.containsValue( 1 ) << endl;
-    cout << "Does vector contain only zeros?" << vector1.containsOnlyValue( 0 ) << endl;
 
     Containers::Vector< int, Device > vector2( 3 );
     vector2 = 1;
diff --git a/Documentation/Tutorials/Arrays/CMakeLists.txt b/Documentation/Tutorials/Arrays/CMakeLists.txt
index cc1f52267566c2bc58aba962aba7a1302869afcc..71facac2fa7dd309494213eaae2348b1a87c1234 100644
--- a/Documentation/Tutorials/Arrays/CMakeLists.txt
+++ b/Documentation/Tutorials/Arrays/CMakeLists.txt
@@ -9,8 +9,8 @@ IF( BUILD_CUDA )
    ADD_CUSTOM_COMMAND( COMMAND ArrayView-2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ArrayView-2.out OUTPUT ArrayView-2.out )
    CUDA_ADD_EXECUTABLE( ArrayViewForElements ArrayViewForElements.cu )
    ADD_CUSTOM_COMMAND( COMMAND ArrayViewForElements > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ArrayViewForElements.out OUTPUT ArrayViewForElements.out )
-   CUDA_ADD_EXECUTABLE( ContainsValue ContainsValue.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ContainsValue > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ContainsValue.out OUTPUT ContainsValue.out )
+   CUDA_ADD_EXECUTABLE( contains contains.cu )
+   ADD_CUSTOM_COMMAND( COMMAND contains > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/contains.out OUTPUT contains.out )
    CUDA_ADD_EXECUTABLE( ElementsAccessing-1 ElementsAccessing-1.cu )
    ADD_CUSTOM_COMMAND( COMMAND ElementsAccessing-1 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ElementsAccessing-1.out OUTPUT ElementsAccessing-1.out )
    CUDA_ADD_EXECUTABLE( ElementsAccessing-2 ElementsAccessing-2.cu )
@@ -25,7 +25,7 @@ ADD_CUSTOM_TARGET( TutorialsArrays-cuda ALL DEPENDS
    ArrayIO.out
    ArrayView-1.out
    ArrayView-2.out
-   ContainsValue.out
+   contains.out
    ElementsAccessing-1.out
    ElementsAccessing-2.out
    ArrayViewForElements.out
diff --git a/Documentation/Tutorials/Arrays/ContainsValue.cu b/Documentation/Tutorials/Arrays/ContainsValue.cu
deleted file mode 120000
index 015d07af1c26dda2b85609551b97833c095f906a..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/Arrays/ContainsValue.cu
+++ /dev/null
@@ -1 +0,0 @@
-ContainsValue.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Arrays/ContainsValue.cpp b/Documentation/Tutorials/Arrays/contains.cpp
similarity index 74%
rename from Documentation/Tutorials/Arrays/ContainsValue.cpp
rename to Documentation/Tutorials/Arrays/contains.cpp
index 4b726a7bd8a21c32699effd10bb58046dac976a5..d840c8751b861a7213ffd5cf53d35ead5cf820bc 100644
--- a/Documentation/Tutorials/Arrays/ContainsValue.cpp
+++ b/Documentation/Tutorials/Arrays/contains.cpp
@@ -1,9 +1,10 @@
 #include <iostream>
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/ArrayView.h>
+#include <TNL/Algorithms/contains.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
+using namespace TNL::Algorithms;
 
 int main( int argc, char* argv[] )
 {
@@ -18,35 +19,35 @@ int main( int argc, char* argv[] )
    /****
     * Test the values stored in the arrays
     */
-   if( a.containsValue( 0.0 ) )
+   if( contains( a, 0.0 ) )
       std::cout << "a contains 0" << std::endl;
 
-   if( a.containsValue( 1.0 ) )
+   if( contains( a, 1.0 ) )
       std::cout << "a contains 1" << std::endl;
 
-   if( b.containsValue( 0.0 ) )
+   if( contains( b, 0.0 ) )
       std::cout << "b contains 0" << std::endl;
 
-   if( b.containsValue( 1.0 ) )
+   if( contains( b, 1.0 ) )
       std::cout << "b contains 1" << std::endl;
 
-   if( a.containsOnlyValue( 0.0 ) )
+   if( containsOnlyValue( a, 0.0 ) )
       std::cout << "a contains only 0" << std::endl;
 
-   if( a.containsOnlyValue( 1.0 ) )
+   if( containsOnlyValue( a, 1.0 ) )
       std::cout << "a contains only 1" << std::endl;
 
-   if( b.containsOnlyValue( 0.0 ) )
+   if( containsOnlyValue( b, 0.0 ) )
       std::cout << "b contains only 0" << std::endl;
 
-   if( b.containsOnlyValue( 1.0 ) )
+   if( containsOnlyValue( b, 1.0 ) )
       std::cout << "b contains only 1" << std::endl;
 
    /****
     * Change the first half of b and test it again
     */
    b.forElements( 0, 5, [=] __cuda_callable__ ( int i, float& value ) { value = 0.0; } );
-   if( b.containsOnlyValue( 0.0, 0, 5 ) )
+   if( containsOnlyValue( b, 0.0, 0, 5 ) )
       std::cout << "First five elements of b contains only 0" << std::endl;
 }
 
diff --git a/Documentation/Tutorials/Arrays/contains.cu b/Documentation/Tutorials/Arrays/contains.cu
new file mode 120000
index 0000000000000000000000000000000000000000..6b27a9bc0f999266aa2042cf3a6e34f252cc37fd
--- /dev/null
+++ b/Documentation/Tutorials/Arrays/contains.cu
@@ -0,0 +1 @@
+contains.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/Arrays/tutorial_Arrays.md b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
index ad3016411567666b55ed7eeb375daf7286d3c9f5..a8405e5f06a0dc25d59d9d57e12ff269270b48c1 100644
--- a/Documentation/Tutorials/Arrays/tutorial_Arrays.md
+++ b/Documentation/Tutorials/Arrays/tutorial_Arrays.md
@@ -105,13 +105,17 @@ Output:
 
 ### Checking the array contents
 
-Methods `containsValue` and `containsOnlyValue` serve for testing the contents of the arrays. `containsValue` returns `true` of there is at least one element in the array with given value. `containsOnlyValue` returns `true` only if all elements of the array equal given value. The test can be restricted to subinterval of array elements. Both methods are implemented in `Array` as well as in `ArrayView`. See the following code snippet for example of use.
+The functions \ref TNL::Algorithms::contains and \ref TNL::Algorithms::containsOnlyValue serve for testing the contents of arrays, vectors or their views.
+`contains` returns `true` if there is at least one element in the array with given value.
+`containsOnlyValue` returns `true` only if all elements of the array are equal to the given value.
+The test can be restricted to a subinterval of array elements.
+See the following code snippet for usage example.
 
-\include ContainsValue.cpp
+\include contains.cpp
 
 Output:
 
-\include ContainsValue.out
+\include contains.out
 
 ### IO operations with arrays
 
diff --git a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
index 594ebd8cd53b91eb12a871a5c70787992b9a0fb5..b88328a41b25442013db92834353cf7c4da4c5ef 100644
--- a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
+++ b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
@@ -12,8 +12,10 @@ set( COMMON_EXAMPLES
      MapReduceExample-3
      ReductionWithArgument
      ReductionWithArgumentWithFunctional
-     ScanExample
-     ExclusiveScanExample
+     inclusiveScanExample
+     exclusiveScanExample
+     inplaceInclusiveScanExample
+     inplaceExclusiveScanExample
      SegmentedScanExample
 )
 
@@ -46,4 +48,4 @@ IF( BUILD_CUDA )
    ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
 ELSE()
    ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples ALL DEPENDS ${HOST_OUTPUTS} )
-ENDIF()
\ No newline at end of file
+ENDIF()
diff --git a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
index 8972af7f44813a92969cb8ca5e1925002a15274c..3279fa377a170b1cb904a3c2ba8c0c6d62a0d25f 100644
--- a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cpp
deleted file mode 100644
index 29817aa1427405142a2feb07362f9ad443fa4b39..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-using namespace TNL::Algorithms;
-
-template< typename Device >
-void scan( Vector< double, Device >& v )
-{
-   /***
-    * Reduction is sum of two numbers.
-    */
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-
-   /***
-    * As parameters, we pass vector on which the scan is to be performed, interval
-    * where the scan is performed, lambda function which is used by the scan and
-    * zero element (idempotent) of the 'sum' operation.
-    */
-   Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduce, 0.0 );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Firstly, test the exclusive prefix sum with vectors allocated on CPU.
-    */
-   Vector< double, Devices::Host > host_v( 10 );
-   host_v = 1.0;
-   std::cout << "host_v = " << host_v << std::endl;
-   scan( host_v );
-   std::cout << "The exclusive prefix sum of the host vector is " << host_v << "." << std::endl;
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v( 10 );
-   cuda_v = 1.0;
-   std::cout << "cuda_v = " << cuda_v << std::endl;
-   scan( cuda_v );
-   std::cout << "The exclusive prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
-#endif
-   return EXIT_SUCCESS;
-}
-
diff --git a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cu
deleted file mode 120000
index 75896ca31b875a9a8c2493b360a30de33af18860..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ReductionAndScan/ExclusiveScanExample.cu
+++ /dev/null
@@ -1 +0,0 @@
-ExclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
index ff02f9c86f800b70e45f80690c1eb6b54c37da6f..8d1527aaaa055365732bfc1f1a9e65b4a93b24dc 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
index 065f4608ad1194a1c6867857567a2d67cef37bb7..c0cdb7e211640a2ee87485d6cc8665447b495bfe 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Timer.h>
 
 using namespace TNL;
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
index f3c54f6b0f66a9cbb353021a2858311c405ff1fd..0b93682c14e74286b4d5ba142f36367b23ffc7f4 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Timer.h>
 
 using namespace TNL;
diff --git a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
index c9a5926ad741dd85f971af34374e45549b4b10a3..b79042db65c9b8052d335e17e53ddf9712fe19f4 100644
--- a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
index 389ecd4975f21a1fedbd994fb11c4564c3018b3a..ace350b3978381ee13e6dff0316b7be4ee959a5d 100644
--- a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
index 79a82c7334270e5b4cc3360f09b7d2761bc0f65f..d7dba9594224954afc4a476d85ba14c6cdecfb9d 100644
--- a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
index 7b084db0eaa80805c6c54cdc7beb0828c7907715..e5d24ab43d9da868501680e56b359dd96679efc7 100644
--- a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
index 2dd84aa03e55159ecb130fbf51c004f68c49eb93..a44410185da80c0bc5fa0a5dd369543c0c7d7879 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
index 4838f5f77e98515b1b20845ea1f2ee0e626c9109..df9b30206d7aa686a2deeb7f8d72a62e69de3be9 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
deleted file mode 100644
index 5281bfd5460711944fd0bbcbdf867d679e8e7954..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
-
-using namespace TNL;
-using namespace TNL::Containers;
-using namespace TNL::Algorithms;
-
-template< typename Device >
-void scan( Vector< double, Device >& v )
-{
-   /***
-    * Reduction is sum of two numbers.
-    */
-   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-
-   /***
-    * As parameters, we pass vector on which the scan is to be performed, interval
-    * where the scan is performed, lambda function which is used by the scan and
-    * zero element (idempotent) of the 'sum' operation.
-    */
-   Scan< Device >::perform( v, 0, v.getSize(), reduction, 0.0 );
-}
-
-int main( int argc, char* argv[] )
-{
-   /***
-    * Firstly, test the prefix sum with vectors allocated on CPU.
-    */
-   Vector< double, Devices::Host > host_v( 10 );
-   host_v = 1.0;
-   std::cout << "host_v = " << host_v << std::endl;
-   scan( host_v );
-   std::cout << "The prefix sum of the host vector is " << host_v << "." << std::endl;
-
-   /***
-    * And then also on GPU.
-    */
-#ifdef HAVE_CUDA
-   Vector< double, Devices::Cuda > cuda_v( 10 );
-   cuda_v = 1.0;
-   std::cout << "cuda_v = " << cuda_v << std::endl;
-   scan( cuda_v );
-   std::cout << "The prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
-#endif
-   return EXIT_SUCCESS;
-}
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ScanExample.cu b/Documentation/Tutorials/ReductionAndScan/ScanExample.cu
deleted file mode 120000
index d93679f617cb7ac53f9e1c5f7e5c1d216b1c6c65..0000000000000000000000000000000000000000
--- a/Documentation/Tutorials/ReductionAndScan/ScanExample.cu
+++ /dev/null
@@ -1 +0,0 @@
-ScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp
index 5e1379f5d572007beeee1fdcc6671c1240cc8973..7cfd433542ec0960439d076fcce7706cdacf3937 100644
--- a/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SegmentedScanExample.cpp
@@ -1,13 +1,13 @@
 #include <iostream>
-#include <cstdlib>
-#include <TNL/Containers/Vector.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/SegmentedScan.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
 using namespace TNL::Algorithms;
 
 template< typename Device >
-void segmentedScan( Vector< double, Device >& v, Vector< bool, Device >& flags )
+void segmentedScan( Array< double, Device >& v, Array< bool, Device >& flags )
 {
    /***
     * Reduction is sum of two numbers.
@@ -15,7 +15,7 @@ void segmentedScan( Vector< double, Device >& v, Vector< bool, Device >& flags )
    auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
 
    /***
-    * As parameters, we pass vector on which the scan is to be performed, interval
+    * As parameters, we pass array on which the scan is to be performed, interval
     * where the scan is performed, lambda function which is used by the scan and
     * zero element (idempotent) of the 'sum' operation.
     */
@@ -25,25 +25,25 @@ void segmentedScan( Vector< double, Device >& v, Vector< bool, Device >& flags )
 int main( int argc, char* argv[] )
 {
    /***
-    * Firstly, test the segmented prefix sum with vectors allocated on CPU.
+    * Firstly, test the segmented prefix sum with arrays allocated on CPU.
     */
-   Vector< bool, Devices::Host > host_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
-   Vector< double, Devices::Host > host_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
+   Array< bool, Devices::Host > host_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
+   Array< double, Devices::Host > host_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
    std::cout << "host_flags = " << host_flags << std::endl;
    std::cout << "host_v     = " << host_v << std::endl;
    segmentedScan( host_v, host_flags );
-   std::cout << "The segmented prefix sum of the host vector is " << host_v << "." << std::endl;
+   std::cout << "The segmented prefix sum of the host array is " << host_v << "." << std::endl;
 
    /***
     * And then also on GPU.
     */
 #ifdef HAVE_CUDA
-   //Vector< bool, Devices::Cuda > cuda_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
-   //Vector< double, Devices::Cuda > cuda_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
+   //Array< bool, Devices::Cuda > cuda_flags{ 1,0,0,1,0,0,0,1,0,1,0,0, 0, 0 };
+   //Array< double, Devices::Cuda > cuda_v { 1,3,5,2,4,6,9,3,5,3,6,9,12,15 };
    //std::cout << "cuda_flags = " << cuda_flags << std::endl;
    //std::cout << "cuda_v     = " << cuda_v << std::endl;
    //segmentedScan( cuda_v, cuda_flags );
-   //std::cout << "The segmnted prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
+   //std::cout << "The segmented prefix sum of the CUDA array is " << cuda_v << "." << std::endl;
 #endif
    return EXIT_SUCCESS;
 }
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
index cfa6e1befd8c75322139d5dcdf9a6558caced9ce..278ade2e5ddfcd5129ce43e8e1d3879908c5f614 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
index 9ef7795cdd80e8ba70216028566b2a35cdaa04aa..4197436e12360d648d0c363f4117b247d6775a29 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
index a2ccb8189993580c35cc31e5350b503d0bf4f7f4..bb8f20d2c7909cd9836eed8e04ebe57987328aea 100644
--- a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
@@ -1,7 +1,7 @@
 #include <iostream>
 #include <cstdlib>
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 using namespace TNL;
 using namespace TNL::Containers;
diff --git a/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6cd7879902340587c09b8d7a6acc468674df993b
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the prefix sum with an array allocated on CPU.
+    */
+   Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
+   host_input = 1.0;
+   std::cout << "host_input = " << host_input << std::endl;
+   exclusiveScan( host_input, host_output );
+   std::cout << "host_output " << host_output << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
+   cuda_input = 1.0;
+   std::cout << "cuda_input = " << cuda_input << std::endl;
+   exclusiveScan( cuda_input, cuda_output );
+   std::cout << "cuda_output " << cuda_output << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..a0f42394f5250e33680a900222cf37fa7a00f919
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/exclusiveScanExample.cu
@@ -0,0 +1 @@
+exclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..33737897d48bf8d0f1295112ae2ee537f81b958d
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the prefix sum with an array allocated on CPU.
+    */
+   Array< double, Devices::Host > host_input( 10 ), host_output( 10 );
+   host_input = 1.0;
+   std::cout << "host_input = " << host_input << std::endl;
+   inclusiveScan( host_input, host_output );
+   std::cout << "host_output " << host_output << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_input( 10 ), cuda_output( 10 );
+   cuda_input = 1.0;
+   std::cout << "cuda_input = " << cuda_input << std::endl;
+   inclusiveScan( cuda_input, cuda_output );
+   std::cout << "cuda_output " << cuda_output << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..b192a33488ef96b84b1584d08891ff3ec81a0c76
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inclusiveScanExample.cu
@@ -0,0 +1 @@
+inclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..012e4bcb354701ad204782f59eea02e15f628f4f
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the prefix sum with an array allocated on CPU.
+    */
+   Array< double, Devices::Host > host_a( 10 );
+   host_a = 1.0;
+   std::cout << "host_a = " << host_a << std::endl;
+   inplaceExclusiveScan( host_a );
+   std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_a( 10 );
+   cuda_a = 1.0;
+   std::cout << "cuda_a = " << cuda_a << std::endl;
+   inplaceExclusiveScan( cuda_a );
+   std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..b7692b9c763a4cd2e61453fc41257075352945a0
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceExclusiveScanExample.cu
@@ -0,0 +1 @@
+inplaceExclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ebf42f247de07c3b2a8c95970cb8b77be02ed13c
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cpp
@@ -0,0 +1,31 @@
+#include <iostream>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/scan.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the prefix sum with an array allocated on CPU.
+    */
+   Array< double, Devices::Host > host_a( 10 );
+   host_a = 1.0;
+   std::cout << "host_a = " << host_a << std::endl;
+   inplaceInclusiveScan( host_a );
+   std::cout << "The prefix sum of the host array is " << host_a << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Array< double, Devices::Cuda > cuda_a( 10 );
+   cuda_a = 1.0;
+   std::cout << "cuda_a = " << cuda_a << std::endl;
+   inplaceInclusiveScan( cuda_a );
+   std::cout << "The prefix sum of the CUDA array is " << cuda_a << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cu b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..3f1794e21afdd780ee30c9f052a253955fd33be8
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/inplaceInclusiveScanExample.cu
@@ -0,0 +1 @@
+inplaceInclusiveScanExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index 35246fe4ef75bcf8af3fc3528ecd711b878f5321..59ab08d3f315de8f92211e4593a917c0ee6b4b1b 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -216,29 +216,35 @@ and exclusive prefix sum of the same sequence is
 [0,1,4,9,16,25,36]
 ```
 
-Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only on sumation, however product or logical operations could be handy as well. In TNL, prefix sum is implemented in simillar way as reduction and so it can be easily modified by lambda functions. The following example shows how it works:
+Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only to summation, however product or logical operations could be handy as well. In TNL, scan is implemented in a similar way as reduction and uses the same functors as the reduction operation. The following example shows how it works:
 
-\includelineno ScanExample.cpp
-
-Scan does not use `fetch` function because the scan must be performed on a vector (the first parameter we pass to the scan). Its complexity is also higher compared to reduction. Thus if one needs to do some operation with the vector elements before the scan, this can be done explicitly and it will not affect the performance significantlty. On the other hand, the scan function takes interval of the vector elements where the scan is performed as its second and third argument. The next argument is the operation to be performed by the scan and the last parameter is the idempotent ("zero") element if the operation.
-
-The result looks as:
-
-\include ScanExample.out
+```
+inplaceInclusiveScan( array, 0, array.getSize(), TNL::Plus{} );
+```
 
-Exclusive scan works the same way, we just need to specify it by the second template parameter which is set to `ScanType::Exclusive`. The call of the scan then looks as
+This is equivalent to the following shortened call (the second, third and fourth parameters have a default value):
 
 ```
-Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduction, 0.0 );
+inplaceInclusiveScan( array );
 ```
 
 The complete example looks as follows:
 
-\includelineno ExclusiveScanExample.cpp
+\includelineno inplaceInclusiveScanExample.cpp
+
+Scan does not use a `fetch` function because the scan must be performed on an array. Its complexity is also higher compared to reduction. Thus if one needs to do some operation with the array elements before the scan, this can be done explicitly and it will not affect the performance significantly. On the other hand, the scan function takes the interval of the array elements where the scan is performed as its second and third argument. The next argument is the operation to be performed by the scan and the last parameter is the idempotent ("zero") element of the operation.
+
+The result looks as:
+
+\include inplaceInclusiveScanExample.out
+
+Exclusive scan works similarly. The complete example looks as follows:
+
+\includelineno inplaceExclusiveScanExample.cpp
 
 And the result looks as:
 
-\include ExclusiveScanExample.out
+\include inplaceExclusiveScanExample.out
 
 ### Segmented scan
 
@@ -272,4 +278,4 @@ In addition to common scan, we need to encode the segments of the input sequence
 
 The result reads as:
 
-\include SegmentedScanExample.out
\ No newline at end of file
+\include SegmentedScanExample.out
diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
index 72c1f344dcc843344600eaefaac9bd35a7d1f010..a8d0457fc639956847f660bf62fb12ca3fdd62f3 100644
--- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp
+++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
@@ -10,7 +10,7 @@
 
 #pragma once
 
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include "CommonVectorOperations.h"
 
 namespace TNL {
diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h
index 5531b360d913b70a24eeead5bbef2c280c044904..3391f23fa766a1c5627610022c39d38b17cd30ae 100644
--- a/src/Benchmarks/BLAS/vector-operations.h
+++ b/src/Benchmarks/BLAS/vector-operations.h
@@ -13,10 +13,12 @@
 #pragma once
 
 #include <stdlib.h> // srand48
+#include <numeric>  // std::partial_sum
 
 #include "../Benchmarks.h"
 
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include "CommonVectorOperations.h"
 #include "VectorOperations.h"
 
@@ -39,6 +41,7 @@ benchmarkVectorOperations( Benchmark & benchmark,
 {
    using HostVector = Containers::Vector< Real, Devices::Host, Index >;
    using CudaVector =  Containers::Vector< Real, Devices::Cuda, Index >;
+   using SequentialView = Containers::VectorView< Real, Devices::Sequential, Index >;
    using HostView = Containers::VectorView< Real, Devices::Host, Index >;
    using CudaView =  Containers::VectorView< Real, Devices::Cuda, Index >;
 
@@ -564,31 +567,133 @@ benchmarkVectorOperations( Benchmark & benchmark,
 #endif
 
    ////
-   // Inclusive scan
-   auto inclusiveScanHost = [&]() {
-      hostVector.scan();
+   // Inplace inclusive scan
+   auto inplaceInclusiveScanHost = [&]() {
+      Algorithms::inplaceInclusiveScan( hostVector );
+   };
+   auto inplaceInclusiveScanSequential = [&]() {
+      SequentialView view;
+      view.bind( hostVector.getData(), hostVector.getSize() );
+      Algorithms::inplaceInclusiveScan( view );
+   };
+   auto inplaceInclusiveScanSTL = [&]() {
+      std::partial_sum( hostVector.getData(), hostVector.getData() + hostVector.getSize(), hostVector.getData() );
+   };
+   benchmark.setOperation( "inclusive scan (inplace)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", inplaceInclusiveScanHost );
+   benchmark.time< Devices::Sequential >( reset1, "CPU sequential", inplaceInclusiveScanSequential );
+   benchmark.time< Devices::Sequential >( reset1, "CPU std::partial_sum", inplaceInclusiveScanSTL );
+   // TODO: there are also `std::inclusive_scan` and `std::exclusive_scan` since C++17 which are parallel,
+   // add them to the benchmark when we use C++17
+#ifdef HAVE_CUDA
+   auto inplaceInclusiveScanCuda = [&]() {
+      Algorithms::inplaceInclusiveScan( deviceVector );
+   };
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inplaceInclusiveScanCuda );
+#endif
+
+   ////
+   // Inclusive scan of one vector
+   auto inclusiveScanOneVectorHost = [&]() {
+      Algorithms::inclusiveScan( hostVector, hostVector2 );
+   };
+   benchmark.setOperation( "inclusive scan (1 vector)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", inclusiveScanOneVectorHost );
+#ifdef HAVE_CUDA
+   auto inclusiveScanOneVectorCuda = [&]() {
+      Algorithms::inclusiveScan( deviceVector, deviceVector2 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", inclusiveScanOneVectorCuda );
+#endif
+
+   ////
+   // Inclusive scan of two vectors
+   auto inclusiveScanTwoVectorsHost = [&]() {
+      Algorithms::inclusiveScan( hostVector + hostVector2, hostVector3 );
+   };
+   benchmark.setOperation( "inclusive scan (2 vectors)", 3 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", inclusiveScanTwoVectorsHost );
+#ifdef HAVE_CUDA
+   auto inclusiveScanTwoVectorsCuda = [&]() {
+      Algorithms::inclusiveScan( deviceVector + deviceVector2, deviceVector3 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", inclusiveScanTwoVectorsCuda );
+#endif
+
+   ////
+   // Inclusive scan of three vectors
+   auto inclusiveScanThreeVectorsHost = [&]() {
+      Algorithms::inclusiveScan( hostVector + hostVector2 + hostVector3, hostVector4 );
+   };
+   benchmark.setOperation( "inclusive scan (3 vectors)", 4 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", inclusiveScanThreeVectorsHost );
+#ifdef HAVE_CUDA
+   auto inclusiveScanThreeVectorsCuda = [&]() {
+      Algorithms::inclusiveScan( deviceVector + deviceVector2 + deviceVector3, deviceVector4 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", inclusiveScanThreeVectorsCuda );
+#endif
+
+   ////
+   // Inplace exclusive scan
+   auto inplaceExclusiveScanHost = [&]() {
+      Algorithms::inplaceExclusiveScan( hostVector );
+   };
+   auto inplaceExclusiveScanSequential = [&]() {
+      SequentialView view;
+      view.bind( hostVector.getData(), hostVector.getSize() );
+      Algorithms::inplaceExclusiveScan( view );
+   };
+   benchmark.setOperation( "exclusive scan (inplace)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( reset1, "CPU ET", inplaceExclusiveScanHost );
+   benchmark.time< Devices::Sequential >( reset1, "CPU sequential", inplaceExclusiveScanSequential );
+#ifdef HAVE_CUDA
+   auto inplaceExclusiveScanCuda = [&]() {
+      Algorithms::inplaceExclusiveScan( deviceVector );
+   };
+   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inplaceExclusiveScanCuda );
+#endif
+
+   ////
+   // Exclusive scan of one vector
+   auto exclusiveScanOneVectorHost = [&]() {
+      Algorithms::exclusiveScan( hostVector, hostVector2 );
+   };
+   benchmark.setOperation( "exclusive scan (1 vector)", 2 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", exclusiveScanOneVectorHost );
+#ifdef HAVE_CUDA
+   auto exclusiveScanOneVectorCuda = [&]() {
+      Algorithms::exclusiveScan( deviceVector, deviceVector2 );
+   };
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", exclusiveScanOneVectorCuda );
+#endif
+
+   ////
+   // Exclusive scan of two vectors
+   auto exclusiveScanTwoVectorsHost = [&]() {
+      Algorithms::exclusiveScan( hostVector + hostVector2, hostVector3 );
    };
-   benchmark.setOperation( "inclusive scan", 2 * datasetSize );
-   benchmark.time< Devices::Host >( reset1, "CPU ET", inclusiveScanHost );
+   benchmark.setOperation( "exclusive scan (2 vectors)", 3 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", exclusiveScanTwoVectorsHost );
 #ifdef HAVE_CUDA
-   auto inclusiveScanCuda = [&]() {
-      deviceVector.scan();
+   auto exclusiveScanTwoVectorsCuda = [&]() {
+      Algorithms::exclusiveScan( deviceVector + deviceVector2, deviceVector3 );
    };
-   benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusiveScanCuda );
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", exclusiveScanTwoVectorsCuda );
 #endif
 
    ////
-   // Exclusive scan
-   auto exclusiveScanHost = [&]() {
-      hostVector.template scan< Algorithms::ScanType::Exclusive >();
+   // Exclusive scan of three vectors
+   auto exclusiveScanThreeVectorsHost = [&]() {
+      Algorithms::exclusiveScan( hostVector + hostVector2 + hostVector3, hostVector4 );
    };
-   benchmark.setOperation( "exclusive scan", 2 * datasetSize );
-   benchmark.time< Devices::Host >( reset1, "CPU ET", exclusiveScanHost );
+   benchmark.setOperation( "exclusive scan (3 vectors)", 4 * datasetSize );
+   benchmark.time< Devices::Host >( resetAll, "CPU ET", exclusiveScanThreeVectorsHost );
 #ifdef HAVE_CUDA
-   auto exclusiveScanCuda = [&]() {
-      deviceVector.template scan< Algorithms::ScanType::Exclusive >();
+   auto exclusiveScanThreeVectorsCuda = [&]() {
+      Algorithms::exclusiveScan( deviceVector + deviceVector2 + deviceVector3, deviceVector4 );
    };
-   benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusiveScanCuda );
+   benchmark.time< Devices::Cuda >( resetAll, "GPU ET", exclusiveScanThreeVectorsCuda );
 #endif
 
 #ifdef HAVE_CUDA
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
index d33ee47cc778187237378da3ddb3ccda99271511..e1f7f8fa34776e0c356ae420c296808e126f7bd8 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack_impl.h
@@ -13,6 +13,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/BiEllpack.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <cstdio>
 
@@ -97,8 +98,7 @@ setCompressedRowLengths( ConstRowsCapacitiesTypeView constRowLengths )
     DeviceDependentCode::performRowBubbleSort( *this, rowLengths );
     DeviceDependentCode::computeColumnSizes( *this, rowLengths );
 
-    //this->groupPointers.computeExclusivePrefixSum();
-    this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
+    Algorithms::inplaceExclusiveScan( this->groupPointers );
 
     DeviceDependentCode::verifyRowPerm( *this, rowLengths );
     DeviceDependentCode::verifyRowLengths( *this, rowLengths );
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
index ed5ec486c0178a9d6a967203cdf94396c109e0d3..2cb2b4784d37266cd79dce0968845de0dc3ef705 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/CSR_impl.h
@@ -12,6 +12,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/CSR.h>
 #include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Exceptions/NotImplementedError.h>
@@ -102,7 +103,7 @@ void CSR< Real, Device, Index, KernelType >::setCompressedRowLengths( ConstRowsC
    rowPtrs.bind( this->rowPointers.getData(), this->getRows() );
    rowPtrs = rowLengths;
    this->rowPointers.setElement( this->rows, 0 );
-   this->rowPointers.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( this->rowPointers );
    this->maxRowLength = max( rowLengths );
 
    /****
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
index df662277706d326f27afc60c2467752e91d0d817..28bd8313a808458fbb6f7673e9d8b97dad62d9b7 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack_impl.h
@@ -12,6 +12,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/ChunkedEllpack.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
@@ -218,7 +219,7 @@ void ChunkedEllpack< Real, Device, Index >::setCompressedRowLengths( ConstRowsCa
       this->rowPointers.setElement( 0, 0 );
       for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ )
          this->setSlice( rowLengths, sliceIndex, elementsToAllocation );
-      this->rowPointers.scan();
+      Algorithms::inplaceInclusiveScan( this->rowPointers );
    }
 
    if( std::is_same< Device, Devices::Cuda >::value )
diff --git a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
index c7127cf1fd0e95e979ad1ca8619c9d836bae81a0..4dc0f4480944b2e60ea78c06c415ce95b76bc589 100644
--- a/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
+++ b/src/Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack_impl.h
@@ -12,6 +12,7 @@
 
 #include <Benchmarks/SpMV/ReferenceFormats/Legacy/SlicedEllpack.h>
 #include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Math.h>
 #include <TNL/Exceptions/NotImplementedError.h>
 
@@ -82,7 +83,7 @@ void SlicedEllpack< Real, Device, Index, SliceSize >::setCompressedRowLengths( C
 
    this->maxRowLength = max( rowLengths );
 
-   this->slicePointers.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( this->slicePointers );
    this->allocateMatrixElements( this->slicePointers.getElement( slices ) );
 }
 
diff --git a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
index f4822b6ca72d605665663fc5be1bd29f1059ab5d..01a5307d70109145f65cf59ee74cf3c5d4389a09 100644
--- a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
+++ b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Real,
         typename Device,
@@ -323,7 +324,7 @@ solve( const Meshes::DistributedMeshes::DistributedMesh< MeshType >& distributed
 
            oddEvenBlock= (oddEvenBlock == 0) ? 1: 0;
 
-           calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
+           calculateCudaBlocksAgain = Algorithms::contains( blockCalculationIndicator, 1);
           */
   /**------------------------------------------------------------------------------------------------*/
 
@@ -349,7 +350,7 @@ solve( const Meshes::DistributedMeshes::DistributedMesh< MeshType >& distributed
           TNL_CHECK_CUDA_DEVICE;
 
           // "Parallel reduction" to see if we should calculate again calculateCudaBlocksAgain
-          calculateCudaBlocksAgain = blockCalculationIndicator.containsValue(1);
+          calculateCudaBlocksAgain = Algorithms::contains( blockCalculationIndicator, 1);
 
           // When we change something then we should caclucate again in the next passage of MPI ( calculated = true )
          if( calculateCudaBlocksAgain ){
diff --git a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
index e0f6ff58adf12156e1034494753d15ed4b446e52..e7f82880c4fb786f365594e73bc566af43f73c62 100644
--- a/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
+++ b/src/Examples/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h
@@ -14,6 +14,7 @@
 #pragma once
 
 #include <TNL/Functions/MeshFunction.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Real,
         typename Device,
@@ -336,8 +337,8 @@ solve( const Meshes::DistributedMeshes::DistributedMesh< MeshType >& distributed
           BlockIterDevice = BlockIterPom;
           Pointers::synchronizeSmartPointersOnDevice< Devices::Cuda >();
 
-          // .containsValue(1) is actually parallel reduction implemented in TNL
-          BlockIterD = BlockIterDevice.containsValue(1);
+          // contains(...) is actually parallel reduction implemented in TNL
+          BlockIterD = Algorithms::contains( BlockIterDevice, 1);
           cudaDeviceSynchronize();
           TNL_CHECK_CUDA_DEVICE;
 
diff --git a/src/TNL/Algorithms/DistributedScan.h b/src/TNL/Algorithms/DistributedScan.h
deleted file mode 100644
index aa7c008a7b6b5ccfe1445daebdc4312976eead0b..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/DistributedScan.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/***************************************************************************
-                          DistributedScan.h  -  description
-                             -------------------
-    begin                : Aug 16, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Jakub Klinkovsky
-
-#pragma once
-
-#include <TNL/Algorithms/Scan.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/MPI/Wrappers.h>
-
-namespace TNL {
-namespace Algorithms {
-
-template< ScanType Type >
-struct DistributedScan
-{
-   template< typename DistributedVector,
-             typename Reduction >
-   static void
-   perform( DistributedVector& v,
-            typename DistributedVector::IndexType begin,
-            typename DistributedVector::IndexType end,
-            const Reduction& reduction,
-            const typename DistributedVector::RealType zero )
-   {
-      using RealType = typename DistributedVector::RealType;
-      using DeviceType = typename DistributedVector::DeviceType;
-
-      const auto group = v.getCommunicationGroup();
-      if( group != MPI::NullGroup() ) {
-         // adjust begin and end for the local range
-         const auto localRange = v.getLocalRange();
-         begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin();
-         end = max( min( end, localRange.getEnd() ), localRange.getBegin() ) - localRange.getBegin();
-
-         // perform first phase on the local data
-         auto localView = v.getLocalView();
-         const auto blockShifts = Scan< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero );
-         const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 );
-
-         // exchange local sums between ranks
-         const int nproc = MPI::GetSize( group );
-         RealType dataForScatter[ nproc ];
-         for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum;
-         Containers::Vector< RealType, Devices::Host > rankSums( nproc );
-         // NOTE: exchanging general data types does not work with MPI
-         MPI::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group );
-
-         // compute the scan of the per-rank sums
-         Scan< Devices::Host, ScanType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero );
-
-         // perform second phase: shift by the per-block and per-rank offsets
-         const int rank = MPI::GetRank( group );
-         Scan< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] );
-      }
-   }
-};
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/MemoryOperations.h b/src/TNL/Algorithms/MemoryOperations.h
index 42c37f062c62e7ae7108fe0646bbde72d4dd9d15..56c3498bfe62a59025fcb490b083ec4dd7e42109 100644
--- a/src/TNL/Algorithms/MemoryOperations.h
+++ b/src/TNL/Algorithms/MemoryOperations.h
@@ -81,20 +81,6 @@ struct MemoryOperations< Devices::Sequential >
    static bool compare( const Element1* destination,
                         const Element2* source,
                         const Index size );
-
-   template< typename Element,
-             typename Index >
-   __cuda_callable__
-   static bool containsValue( const Element* data,
-                              const Index size,
-                              const Element& value );
-
-   template< typename Element,
-             typename Index >
-   __cuda_callable__
-   static bool containsOnlyValue( const Element* data,
-                                  const Index size,
-                                  const Element& value );
 };
 
 template<>
@@ -155,18 +141,6 @@ struct MemoryOperations< Devices::Host >
    static bool compare( const Element1* destination,
                         const Element2* source,
                         const Index size );
-
-   template< typename Element,
-             typename Index >
-   static bool containsValue( const Element* data,
-                              const Index size,
-                              const Element& value );
-
-   template< typename Element,
-             typename Index >
-   static bool containsOnlyValue( const Element* data,
-                                  const Index size,
-                                  const Element& value );
 };
 
 template<>
@@ -224,18 +198,6 @@ struct MemoryOperations< Devices::Cuda >
    static bool compare( const Element1* destination,
                         const Element2* source,
                         const Index size );
-
-   template< typename Element,
-             typename Index >
-   static bool containsValue( const Element* data,
-                              const Index size,
-                              const Element& value );
-
-   template< typename Element,
-             typename Index >
-   static bool containsOnlyValue( const Element* data,
-                                  const Index size,
-                                  const Element& value );
 };
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index 626847eba38d4a944796a5989b870c7a44515d43..4c84ec6975c7336de855626d2fe784c6fc519cc2 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -17,7 +17,7 @@
 #include <TNL/Algorithms/MemoryOperations.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Exceptions/CudaSupportMissing.h>
 
 namespace TNL {
@@ -185,37 +185,5 @@ compare( const Element1* destination,
    return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
 }
 
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Cuda >::
-containsValue( const Element* data,
-               const Index size,
-               const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, (Index) 0, "" );
-
-   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
-}
-
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Cuda >::
-containsOnlyValue( const Element* data,
-                   const Index size,
-                   const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
-}
-
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/MemoryOperationsHost.hpp b/src/TNL/Algorithms/MemoryOperationsHost.hpp
index abebd9d156b8d2bbd27c566d65fe799f8a040b8c..7dd2ef1ba7663b556b7e02ba9949ccef57ca4fc5 100644
--- a/src/TNL/Algorithms/MemoryOperationsHost.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsHost.hpp
@@ -16,7 +16,7 @@
 
 #include <TNL/Algorithms/MemoryOperations.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 
 namespace TNL {
 namespace Algorithms {
@@ -169,49 +169,5 @@ compare( const DestinationElement* destination,
    }
 }
 
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Host >::
-containsValue( const Element* data,
-               const Index size,
-               const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
-      auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; };
-      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
-   }
-   else {
-      // sequential algorithm can return as soon as it finds a match
-      return MemoryOperations< Devices::Sequential >::containsValue( data, size, value );
-   }
-}
-
-template< typename Element,
-          typename Index >
-bool
-MemoryOperations< Devices::Host >::
-containsOnlyValue( const Element* data,
-                   const Index size,
-                   const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
-      auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; };
-      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
-   }
-   else {
-      // sequential algorithm can return as soon as it finds a mismatch
-      return MemoryOperations< Devices::Sequential >::containsOnlyValue( data, size, value );
-   }
-}
-
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/MemoryOperationsSequential.hpp b/src/TNL/Algorithms/MemoryOperationsSequential.hpp
index 2ea21d0aca0ead7ab3bcf7d6b95a96599d1f27e3..dd7765decdc66c672053064f83037b739c25bbe9 100644
--- a/src/TNL/Algorithms/MemoryOperationsSequential.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsSequential.hpp
@@ -147,43 +147,5 @@ compare( const Element1* destination,
    return true;
 }
 
-template< typename Element,
-          typename Index >
-__cuda_callable__
-bool
-MemoryOperations< Devices::Sequential >::
-containsValue( const Element* data,
-               const Index size,
-               const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   for( Index i = 0; i < size; i++ )
-      if( data[ i ] == value )
-         return true;
-   return false;
-}
-
-template< typename Element,
-          typename Index >
-__cuda_callable__
-bool
-MemoryOperations< Devices::Sequential >::
-containsOnlyValue( const Element* data,
-                   const Index size,
-                   const Element& value )
-{
-   if( size == 0 ) return false;
-   TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
-   TNL_ASSERT_GE( size, 0, "" );
-
-   for( Index i = 0; i < size; i++ )
-      if( ! ( data[ i ] == value ) )
-         return false;
-   return true;
-}
-
 } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Multireduction.h b/src/TNL/Algorithms/Multireduction.h
index 8e63fa7eabce3e8d9d837770794d991fd12705e7..9e50afdb74cd52896121499c308182499037ef22 100644
--- a/src/TNL/Algorithms/Multireduction.h
+++ b/src/TNL/Algorithms/Multireduction.h
@@ -29,7 +29,9 @@ struct Multireduction< Devices::Sequential >
 {
    /**
     * Parameters:
-    *    zero: starting value for reduction
+    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *              for the reduction operation, i.e. element which does not
+    *              change the result of the reduction
     *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
     *                 the i-th value to be reduced from the j-th dataset
     *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -45,7 +47,7 @@ struct Multireduction< Devices::Sequential >
              typename Reduction,
              typename Index >
    static constexpr void
-   reduce( const Result zero,
+   reduce( const Result identity,
            DataFetcher dataFetcher,
            const Reduction reduction,
            const Index size,
@@ -58,7 +60,9 @@ struct Multireduction< Devices::Host >
 {
    /**
     * Parameters:
-    *    zero: starting value for reduction
+    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *              for the reduction operation, i.e. element which does not
+    *              change the result of the reduction
     *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
     *                 the i-th value to be reduced from the j-th dataset
     *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -74,7 +78,7 @@ struct Multireduction< Devices::Host >
              typename Reduction,
              typename Index >
    static void
-   reduce( const Result zero,
+   reduce( const Result identity,
            DataFetcher dataFetcher,
            const Reduction reduction,
            const Index size,
@@ -87,7 +91,9 @@ struct Multireduction< Devices::Cuda >
 {
    /**
     * Parameters:
-    *    zero: starting value for reduction
+    *    identity: the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *              for the reduction operation, i.e. element which does not
+    *              change the result of the reduction
     *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
     *                 the i-th value to be reduced from the j-th dataset
     *                 (i = 0,...,size-1; j = 0,...,n-1)
@@ -103,7 +109,7 @@ struct Multireduction< Devices::Cuda >
              typename Reduction,
              typename Index >
    static void
-   reduce( const Result zero,
+   reduce( const Result identity,
            DataFetcher dataFetcher,
            const Reduction reduction,
            const Index size,
diff --git a/src/TNL/Algorithms/Multireduction.hpp b/src/TNL/Algorithms/Multireduction.hpp
index 4eb8a93695ecd1a7ee1792873745c7c93486c207..ca7eec8d2dcd8190da4bef06db5e69ceb3475d43 100644
--- a/src/TNL/Algorithms/Multireduction.hpp
+++ b/src/TNL/Algorithms/Multireduction.hpp
@@ -35,7 +35,7 @@ template< typename Result,
           typename Index >
 void constexpr
 Multireduction< Devices::Sequential >::
-reduce( const Result zero,
+reduce( const Result identity,
         DataFetcher dataFetcher,
         const Reduction reduction,
         const Index size,
@@ -53,7 +53,7 @@ reduce( const Result zero,
       // (it is accessed as a row-major matrix with n rows and 4 columns)
       Result r[ n * 4 ];
       for( int k = 0; k < n * 4; k++ )
-         r[ k ] = zero;
+         r[ k ] = identity;
 
       // main reduction (explicitly unrolled loop)
       for( int b = 0; b < blocks; b++ ) {
@@ -89,7 +89,7 @@ reduce( const Result zero,
    }
    else {
       for( int k = 0; k < n; k++ )
-         result[ k ] = zero;
+         result[ k ] = identity;
 
       for( int b = 0; b < blocks; b++ ) {
          const Index offset = b * block_size;
@@ -112,7 +112,7 @@ template< typename Result,
           typename Index >
 void
 Multireduction< Devices::Host >::
-reduce( const Result zero,
+reduce( const Result identity,
         DataFetcher dataFetcher,
         const Reduction reduction,
         const Index size,
@@ -134,14 +134,14 @@ reduce( const Result zero,
          #pragma omp single nowait
          {
             for( int k = 0; k < n; k++ )
-               result[ k ] = zero;
+               result[ k ] = identity;
          }
 
          // initialize array for thread-local results
          // (it is accessed as a row-major matrix with n rows and 4 columns)
          Result r[ n * 4 ];
          for( int k = 0; k < n * 4; k++ )
-            r[ k ] = zero;
+            r[ k ] = identity;
 
          #pragma omp for nowait
          for( int b = 0; b < blocks; b++ ) {
@@ -185,7 +185,7 @@ reduce( const Result zero,
    }
    else
 #endif
-      Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result );
+      Multireduction< Devices::Sequential >::reduce( identity, dataFetcher, reduction, size, n, result );
 }
 
 template< typename Result,
@@ -194,7 +194,7 @@ template< typename Result,
           typename Index >
 void
 Multireduction< Devices::Cuda >::
-reduce( const Result zero,
+reduce( const Result identity,
         DataFetcher dataFetcher,
         const Reduction reduction,
         const Index size,
@@ -212,7 +212,7 @@ reduce( const Result zero,
 
    // start the reduction on the GPU
    Result* deviceAux1 = nullptr;
-   const int reducedSize = detail::CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 );
+   const int reducedSize = detail::CudaMultireductionKernelLauncher( identity, dataFetcher, reduction, size, n, deviceAux1 );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
@@ -234,7 +234,7 @@ reduce( const Result zero,
 
    // finish the reduction on the host
    auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
-   Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
+   Multireduction< Devices::Sequential >::reduce( identity, dataFetcherFinish, reduction, reducedSize, n, hostResult );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
deleted file mode 100644
index da6bca882d17d6bb702ccba53be30c672757ab40..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/Reduction.h
+++ /dev/null
@@ -1,261 +0,0 @@
-/***************************************************************************
-                          Reduction.h  -  description
-                             -------------------
-    begin                : Oct 28, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
-
-#pragma once
-
-#include <utility>  // std::pair, std::forward
-
-#include <TNL/Functional.h>  // extension of STL functionals for reduction
-#include <TNL/Devices/Sequential.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Algorithms/detail/Reduction.h>
-
-namespace TNL {
-   namespace Algorithms {
-
-/**
- * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
- *
- * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
- * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
- * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
- * position of the smallest or the largest element, reduction with argument can be used.
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Index is a type for indexing.
- * \tparam Result is a type of the reduction result.
- * \tparam Fetch is a lambda function for fetching the input data.
- * \tparam Reduce is a lambda function performing the reduction.
- *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
- *
- * \param begin defines range [begin, end) of indexes which will be used for the reduction.
- * \param end defines range [begin, end) of indexes which will be used for the reduction.
- * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation.
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
- * \return result of the reduction
- *
- * The `fetch` lambda function takes one argument which is index of the element to be fetched:
- *
- * ```
- * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
- * ```
- *
- * The `reduce` lambda function takes two variables which are supposed to be reduced:
- *
- * ```
- * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
- * ```
- *
- * \par Example
- *
- * \include ReductionAndScan/SumExampleWithLambda.cpp
- *
- * \par Output
- *
- * \include SumExampleWithLambda.out
- */
-template< typename Device,
-          typename Index,
-          typename Result,
-          typename Fetch,
-          typename Reduce >
-Result reduce( const Index begin,
-               const Index end,
-               Fetch&& fetch,
-               Reduce&& reduce,
-               const Result& zero )
-{
-    return detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ), zero );
-}
-
-/**
- * \brief Variant of \ref TNL::Algorithms::reduce with functional instead of reduction lambda function.
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Index is a type for indexing.
- * \tparam Fetch is a lambda function for fetching the input data.
- * \tparam Reduce is a functional performing the reduction.
- *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
- *
- * \e Reduce can be one of the following \ref TNL::Plus, \ref TNL::Multiplies, \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd,
- *    \ref TNL::LogicalOr, \ref TNL::BitAnd or \ref TNL::BitOr.
- *
- * \param begin defines range [begin, end) of indexes which will be used for the reduction.
- * \param end defines range [begin, end) of indexes which will be used for the reduction.
- * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation.
- * \return result of the reduction
- *
- * The `fetch` lambda function takes one argument which is index of the element to be fetched:
- *
- * ```
- * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
- * ```
- *
- * \par Example
- *
- * \include ReductionAndScan/SumExampleWithFunctional.cpp
- *
- * \par Output
- *
- * \include SumExampleWithFunctional.out
- */
-template< typename Device,
-          typename Index,
-          typename Fetch,
-          typename Reduce >
-auto reduce( const Index begin,
-             const Index end,
-             Fetch&& fetch,
-             Reduce&& reduce )
-{
-   using Result = decltype( fetch( ( Index ) 0 ) );
-   return detail::Reduction< Device >::reduce( begin,
-                                               end,
-                                               std::forward< Fetch >( fetch ),
-                                               std::forward< Reduce >( reduce ),
-                                               reduce.template getIdempotent< Result >() );
-}
-
-/**
- * \brief Variant of \ref TNL::Algorithms::reduce returning also a position of an element of interest.
- *
- * For example in case of computing minimal or maximal element in array/vector,
- * the position of the element having given value can be obtained. The use of this method
- * is, however, more flexible.
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Index is a type for indexing.
- * \tparam Result is a type of the reduction result.
- * \tparam Reduce is a lambda function performing the reduction.
- * \tparam Fetch is a lambda function for fetching the input data.
- *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
- *
- * \param begin defines range [begin, end) of indexes which will be used for the reduction.
- * \param end defines range [begin, end) of indexes which will be used for the reduction.
- * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
- * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
- *         is the element position and `pair.second` is the reduction result.
- *
- * The `fetch` lambda function takes one argument which is index of the element to be fetched:
- *
- * ```
- * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
- * ```
- *
- * The `reduce` lambda function takes two variables which are supposed to be reduced:
- *
- * ```
- * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
- * ```
- *
- * \par Example
- *
- * \include ReductionAndScan/ReductionWithArgument.cpp
- *
- * \par Output
- *
- * \include ReductionWithArgument.out
- */
-template< typename Device,
-          typename Index,
-          typename Result,
-          typename Fetch,
-          typename Reduce >
-std::pair< Result, Index >
-reduceWithArgument( const Index begin,
-                    const Index end,
-                    Fetch&& fetch,
-                    Reduce&& reduce,
-                    const Result& zero )
-{
-    return detail::Reduction< Device >::reduceWithArgument( begin,
-                                                            end,
-                                                            std::forward< Fetch >( fetch ),
-                                                            std::forward< Reduce >( reduce ),
-                                                            zero );
-}
-
-/**
- * \brief Variant of \ref TNL::Algorithms::reduceWithArgument with functional instead of reduction lambda function.
- *
- * For example in case of computing minimal or maximal element in array/vector,
- * the position of the element having given value can be obtained. The use of this method
- * is, however, more flexible.
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Index is a type for indexing.
- * \tparam Result is a type of the reduction result.
- * \tparam Reduce is a functional performing the reduction.
- * \tparam Fetch is a lambda function for fetching the input data.
- *
- * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
- *
- * \e Reduce can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
- * \param begin defines range [begin, end) of indexes which will be used for the reduction.
- * \param end defines range [begin, end) of indexes which will be used for the reduction.
- * \param fetch is a lambda function fetching the input data.
- * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
- * \param zero is the idempotent element for the reduction operation, i.e. element which
- *             does not change the result of the reduction.
- * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
- *         is the element position and `pair.second` is the reduction result.
- *
- * The `fetch` lambda function takes one argument which is index of the element to be fetched:
- *
- * ```
- * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
- * ```
- *
- * The `reduce` lambda function takes two variables which are supposed to be reduced:
- *
- * ```
- * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
- * ```
- *
- * \par Example
- *
- * \include ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
- *
- * \par Output
- *
- * \include ReductionWithArgumentWithFunctional.out
- */
-template< typename Device,
-          typename Index,
-          typename Fetch,
-          typename Reduce >
-auto
-reduceWithArgument( const Index begin,
-                    const Index end,
-                    Fetch&& fetch,
-                    Reduce&& reduce )
-{
-   using Result = decltype( fetch( ( Index ) 0 ) );
-   return detail::Reduction< Device >::reduceWithArgument( begin,
-                                                           end,
-                                                           std::forward< Fetch >( fetch ),
-                                                           std::forward< Reduce >( reduce ),
-                                                           reduce.template getIdempotent< Result >() );
-}
-
-   } // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/Scan.h b/src/TNL/Algorithms/Scan.h
deleted file mode 100644
index 81a5d2f7e753b64391e134e93a0c5bb652e54310..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/Scan.h
+++ /dev/null
@@ -1,417 +0,0 @@
-/***************************************************************************
-                          Scan.h  -  description
-                             -------------------
-    begin                : May 9, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
-
-#pragma once
-
-#include <TNL/Devices/Sequential.h>
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
-
-namespace TNL {
-namespace Algorithms {
-
-/**
- * \brief Scan (or prefix sum) type - inclusive or exclusive.
- *
- * See \ref TNL::Algorithms::Scan.
- */
-enum class ScanType {
-   Exclusive,
-   Inclusive
-};
-
-/**
- * \brief Computes scan (or prefix sum) on a vector.
- *
- * [Scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum) operation turns a sequence
- * \f$a_1, \ldots, a_n\f$ into a sequence \f$s_1, \ldots, s_n\f$ defined as
- *
- * \f[
- * s_i = \sum_{j=1}^i a_i.
- * \f]
- * Exclusive scan (or prefix sum) is defined as
- *
- * \f[
- * \sigma_i = \sum_{j=1}^{i-1} a_i.
- * \f]
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Type parameter says if inclusive or exclusive is scan is to be computed.
- *
- * See \ref Scan< Devices::Host, Type > and \ref Scan< Devices::Cuda, Type >.
- */
-template< typename Device,
-          ScanType Type = ScanType::Inclusive >
-struct Scan;
-
-/**
- * \brief Computes segmented scan (or prefix sum) on a vector.
- *
- * Segmented scan is a modification of common scan. In this case the sequence of
- * numbers in hand is divided into segments like this, for example
- *
- * ```
- * [1,3,5][2,4,6,9][3,5],[3,6,9,12,15]
- * ```
- *
- * and we want to compute inclusive or exclusive scan of each segment. For inclusive segmented prefix sum we get
- *
- * ```
- * [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
- * ```
- *
- * and for exclusive segmented prefix sum it is
- *
- * ```
- * [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
- * ```
- *
- * In addition to common scan, we need to encode the segments of the input sequence.
- * It is done by auxiliary flags array (it can be array of booleans) having `1` at the
- * beginning of each segment and `0` on all other positions. In our example, it would be like this:
- *
- * ```
- * [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
- * [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
- *
- * ```
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- * \tparam Type parameter says if inclusive or exclusive is scan is to be computed.
- *
- * See \ref Scan< Devices::Host, Type > and \ref Scan< Devices::Cuda, Type >.
- *
- * **Note: Segmented scan is not implemented for CUDA yet.**
- */
-template< typename Device,
-          ScanType Type = ScanType::Inclusive >
-struct SegmentedScan;
-
-
-template< ScanType Type >
-struct Scan< Devices::Sequential, Type >
-{
-   /**
-    * \brief Computes scan (prefix sum) sequentially.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include ScanExample.out
-    */
-   template< typename Vector,
-             typename Reduction >
-   static void
-   perform( Vector& v,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::RealType zero );
-
-   template< typename Vector,
-             typename Reduction >
-   static auto
-   performFirstPhase( Vector& v,
-                      const typename Vector::IndexType begin,
-                      const typename Vector::IndexType end,
-                      const Reduction& reduction,
-                      const typename Vector::RealType zero );
-
-   template< typename Vector,
-             typename BlockShifts,
-             typename Reduction >
-   static void
-   performSecondPhase( Vector& v,
-                       const BlockShifts& blockShifts,
-                       const typename Vector::IndexType begin,
-                       const typename Vector::IndexType end,
-                       const Reduction& reduction,
-                       const typename Vector::RealType shift );
-};
-
-template< ScanType Type >
-struct Scan< Devices::Host, Type >
-{
-   /**
-    * \brief Computes scan (prefix sum) using OpenMP.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include ScanExample.out
-    */
-   template< typename Vector,
-             typename Reduction >
-   static void
-   perform( Vector& v,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::RealType zero );
-
-   template< typename Vector,
-             typename Reduction >
-   static auto
-   performFirstPhase( Vector& v,
-                      const typename Vector::IndexType begin,
-                      const typename Vector::IndexType end,
-                      const Reduction& reduction,
-                      const typename Vector::RealType zero );
-
-   template< typename Vector,
-             typename BlockShifts,
-             typename Reduction >
-   static void
-   performSecondPhase( Vector& v,
-                       const BlockShifts& blockShifts,
-                       const typename Vector::IndexType begin,
-                       const typename Vector::IndexType end,
-                       const Reduction& reduction,
-                       const typename Vector::RealType shift );
-};
-
-template< ScanType Type >
-struct Scan< Devices::Cuda, Type >
-{
-   /**
-    * \brief Computes scan (prefix sum) on GPU.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include ScanExample.out
-    */
-   template< typename Vector,
-             typename Reduction >
-   static void
-   perform( Vector& v,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::RealType zero );
-
-   template< typename Vector,
-             typename Reduction >
-   static auto
-   performFirstPhase( Vector& v,
-                      const typename Vector::IndexType begin,
-                      const typename Vector::IndexType end,
-                      const Reduction& reduction,
-                      const typename Vector::RealType zero );
-
-   template< typename Vector,
-             typename BlockShifts,
-             typename Reduction >
-   static void
-   performSecondPhase( Vector& v,
-                       const BlockShifts& blockShifts,
-                       const typename Vector::IndexType begin,
-                       const typename Vector::IndexType end,
-                       const Reduction& reduction,
-                       const typename Vector::RealType shift );
-};
-
-template< ScanType Type >
-struct SegmentedScan< Devices::Sequential, Type >
-{
-   /**
-    * \brief Computes segmented scan (prefix sum) sequentially.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    * \tparam Flags array type containing zeros and ones defining the segments begining
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param flags is an array with zeros and ones defining the segments begining
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SegmentedScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include SegmentedScanExample.out
-    */
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-   static void
-   perform( Vector& v,
-            Flags& flags,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::RealType zero );
-};
-
-template< ScanType Type >
-struct SegmentedScan< Devices::Host, Type >
-{
-   /**
-    * \brief Computes segmented scan (prefix sum) using OpenMP.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    * \tparam Flags array type containing zeros and ones defining the segments begining
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param flags is an array with zeros and ones defining the segments begining
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SegmentedScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include SegmentedScanExample.out
-    */
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-   static void
-   perform( Vector& v,
-            Flags& flags,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::RealType zero );
-};
-
-template< ScanType Type >
-struct SegmentedScan< Devices::Cuda, Type >
-{
-   /**
-    * \brief Computes segmented scan (prefix sum) on GPU.
-    *
-    * \tparam Vector type vector being used for the scan.
-    * \tparam Reduction lambda function defining the reduction operation
-    * \tparam Flags array type containing zeros and ones defining the segments begining
-    *
-    * \param v input vector, the result of scan is stored in the same vector
-    * \param flags is an array with zeros and ones defining the segments begining
-    * \param begin the first element in the array to be scanned
-    * \param end the last element in the array to be scanned
-    * \param reduction lambda function implementing the reduction operation
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SegmentedScanExample.cpp
-    *
-    * \par Output
-    *
-    * \include SegmentedScanExample.out
-    *
-    * **Note: Segmented scan is not implemented for CUDA yet.**
-    */
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-   static void
-   perform( Vector& v,
-            Flags& flags,
-            const typename Vector::IndexType begin,
-            const typename Vector::IndexType end,
-            const Reduction& reduction,
-            const typename Vector::RealType zero );
-};
-
-} // namespace Algorithms
-} // namespace TNL
-
-#include <TNL/Algorithms/Scan.hpp>
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
deleted file mode 100644
index 78d5eaf60ecc50833552feac667d0eb941f06bfb..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/Scan.hpp
+++ /dev/null
@@ -1,378 +0,0 @@
-/***************************************************************************
-                          Scan.hpp  -  description
-                             -------------------
-    begin                : Mar 24, 2013
-    copyright            : (C) 2013 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
-
-#pragma once
-
-#include "Scan.h"
-
-#include <TNL/Assert.h>
-#include <TNL/Containers/Array.h>
-#include <TNL/Containers/StaticArray.h>
-#include <TNL/Algorithms/detail/CudaScanKernel.h>
-#include <TNL/Exceptions/CudaSupportMissing.h>
-#include <TNL/Exceptions/NotImplementedError.h>
-
-namespace TNL {
-namespace Algorithms {
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction >
-void
-Scan< Devices::Sequential, Type >::
-perform( Vector& v,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::RealType zero )
-{
-   // sequential prefix-sum does not need a second phase
-   performFirstPhase( v, begin, end, reduction, zero );
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction >
-auto
-Scan< Devices::Sequential, Type >::
-performFirstPhase( Vector& v,
-                   const typename Vector::IndexType begin,
-                   const typename Vector::IndexType end,
-                   const Reduction& reduction,
-                   const typename Vector::RealType zero )
-{
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
-//   return Containers::StaticArray< 1, RealType > block_sums;
-   Containers::Array< RealType, Devices::Host > block_sums( 1 );
-   block_sums[ 0 ] = zero;
-
-   if( Type == ScanType::Inclusive ) {
-      for( IndexType i = begin + 1; i < end; i++ )
-         v[ i ] = reduction( v[ i ], v[ i - 1 ] );
-      block_sums[ 0 ] = v[ end - 1 ];
-   }
-   else // Exclusive prefix sum
-   {
-      RealType aux = zero;
-      for( IndexType i = begin; i < end; i++ ) {
-         const RealType x = v[ i ];
-         v[ i ] = aux;
-         aux = reduction( aux, x );
-      }
-      block_sums[ 0 ] = aux;
-   }
-
-   return block_sums;
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename BlockShifts,
-             typename Reduction >
-void
-Scan< Devices::Sequential, Type >::
-performSecondPhase( Vector& v,
-                    const BlockShifts& blockShifts,
-                    const typename Vector::IndexType begin,
-                    const typename Vector::IndexType end,
-                    const Reduction& reduction,
-                    const typename Vector::RealType shift )
-{
-   using IndexType = typename Vector::IndexType;
-
-   for( IndexType i = begin; i < end; i++ )
-      v[ i ] = reduction( v[ i ], shift );
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction >
-void
-Scan< Devices::Host, Type >::
-perform( Vector& v,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::RealType zero )
-{
-#ifdef HAVE_OPENMP
-   if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() >= 2 ) {
-      const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero );
-      performSecondPhase( v, blockShifts, begin, end, reduction, zero );
-   }
-   else
-      Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero );
-#else
-   Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero );
-#endif
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction >
-auto
-Scan< Devices::Host, Type >::
-performFirstPhase( Vector& v,
-                   const typename Vector::IndexType begin,
-                   const typename Vector::IndexType end,
-                   const Reduction& reduction,
-                   const typename Vector::RealType zero )
-{
-#ifdef HAVE_OPENMP
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   const int threads = Devices::Host::getMaxThreadsCount();
-   Containers::Array< RealType > block_sums( threads + 1 );
-   block_sums[ 0 ] = zero;
-
-   #pragma omp parallel num_threads(threads)
-   {
-      // init
-      const int thread_idx = omp_get_thread_num();
-      RealType block_sum = zero;
-
-      // perform prefix-sum on blocks statically assigned to threads
-      if( Type == ScanType::Inclusive ) {
-         #pragma omp for schedule(static)
-         for( IndexType i = begin; i < end; i++ ) {
-            block_sum = reduction( block_sum, v[ i ] );
-            v[ i ] = block_sum;
-         }
-      }
-      else {
-         #pragma omp for schedule(static)
-         for( IndexType i = begin; i < end; i++ ) {
-            const RealType x = v[ i ];
-            v[ i ] = block_sum;
-            block_sum = reduction( block_sum, x );
-         }
-      }
-
-      // write the block sums into the buffer
-      block_sums[ thread_idx + 1 ] = block_sum;
-   }
-
-   // block_sums now contains sums of numbers in each block. The first phase
-   // ends by computing prefix-sum of this array.
-   for( int i = 1; i < threads + 1; i++ )
-      block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] );
-
-   // block_sums now contains shift values for each block - to be used in the second phase
-   return block_sums;
-#else
-   return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
-#endif
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename BlockShifts,
-             typename Reduction >
-void
-Scan< Devices::Host, Type >::
-performSecondPhase( Vector& v,
-                    const BlockShifts& blockShifts,
-                    const typename Vector::IndexType begin,
-                    const typename Vector::IndexType end,
-                    const Reduction& reduction,
-                    const typename Vector::RealType shift )
-{
-#ifdef HAVE_OPENMP
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   const int threads = blockShifts.getSize() - 1;
-
-   // launch exactly the same number of threads as in the first phase
-   #pragma omp parallel num_threads(threads)
-   {
-      const int thread_idx = omp_get_thread_num();
-      const RealType offset = reduction( blockShifts[ thread_idx ], shift );
-
-      // shift intermediate results by the offset
-      #pragma omp for schedule(static)
-      for( IndexType i = begin; i < end; i++ )
-         v[ i ] = reduction( v[ i ], offset );
-   }
-#else
-   Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, shift );
-#endif
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction >
-void
-Scan< Devices::Cuda, Type >::
-perform( Vector& v,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::RealType zero )
-{
-#ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::perform(
-      end - begin,
-      &v.getData()[ begin ],  // input
-      &v.getData()[ begin ],  // output
-      reduction,
-      zero );
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction >
-auto
-Scan< Devices::Cuda, Type >::
-performFirstPhase( Vector& v,
-                   const typename Vector::IndexType begin,
-                   const typename Vector::IndexType end,
-                   const Reduction& reduction,
-                   const typename Vector::RealType zero )
-{
-#ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   return detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performFirstPhase(
-      end - begin,
-      &v.getData()[ begin ],  // input
-      &v.getData()[ begin ],  // output
-      reduction,
-      zero );
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename BlockShifts,
-             typename Reduction >
-void
-Scan< Devices::Cuda, Type >::
-performSecondPhase( Vector& v,
-                    const BlockShifts& blockShifts,
-                    const typename Vector::IndexType begin,
-                    const typename Vector::IndexType end,
-                    const Reduction& reduction,
-                    const typename Vector::RealType shift )
-{
-#ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   detail::CudaScanKernelLauncher< Type, RealType, IndexType >::performSecondPhase(
-      end - begin,
-      &v.getData()[ begin ],  // output
-      blockShifts.getData(),
-      reduction,
-      shift );
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-void
-SegmentedScan< Devices::Sequential, Type >::
-perform( Vector& v,
-         Flags& flags,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::RealType zero )
-{
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   if( Type == ScanType::Inclusive )
-   {
-      for( IndexType i = begin + 1; i < end; i++ )
-         if( ! flags[ i ] )
-            v[ i ] = reduction( v[ i ], v[ i - 1 ] );
-   }
-   else // Exclusive prefix sum
-   {
-       RealType aux( v[ begin ] );
-      v[ begin ] = zero;
-      for( IndexType i = begin + 1; i < end; i++ )
-      {
-         RealType x = v[ i ];
-         if( flags[ i ] )
-            aux = zero;
-         v[ i ] = aux;
-         aux = reduction( aux, x );
-      }
-   }
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-void
-SegmentedScan< Devices::Host, Type >::
-perform( Vector& v,
-         Flags& flags,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::RealType zero )
-{
-#ifdef HAVE_OPENMP
-   // TODO: parallelize with OpenMP
-   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
-#else
-   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
-#endif
-}
-
-template< ScanType Type >
-   template< typename Vector,
-             typename Reduction,
-             typename Flags >
-void
-SegmentedScan< Devices::Cuda, Type >::
-perform( Vector& v,
-         Flags& flags,
-         const typename Vector::IndexType begin,
-         const typename Vector::IndexType end,
-         const Reduction& reduction,
-         const typename Vector::RealType zero )
-{
-#ifdef HAVE_CUDA
-   using RealType = typename Vector::RealType;
-   using IndexType = typename Vector::IndexType;
-
-   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
-#else
-   throw Exceptions::CudaSupportMissing();
-#endif
-}
-
-} // namespace Algorithms
-} // namespace TNL
diff --git a/src/TNL/Algorithms/SegmentedScan.h b/src/TNL/Algorithms/SegmentedScan.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbcf4260b8ad7261c3da4689b64f4f0666dc9c33
--- /dev/null
+++ b/src/TNL/Algorithms/SegmentedScan.h
@@ -0,0 +1,209 @@
+/***************************************************************************
+                          SegmentedScan.h  -  description
+                             -------------------
+    begin                : May 9, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+// TODO: move this into the detail namespace, create dispatching functions like
+// inplaceInclusiveSegmentedScan, inplaceExclusiveSegmentedScan, etc.
+
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Algorithms/detail/ScanType.h>
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Computes segmented scan (or prefix sum) on a vector.
+ *
+ * Segmented scan is a modification of common scan. In this case the sequence of
+ * numbers in hand is divided into segments like this, for example
+ *
+ * ```
+ * [1,3,5][2,4,6,9][3,5][3,6,9,12,15]
+ * ```
+ *
+ * and we want to compute inclusive or exclusive scan of each segment. For inclusive segmented prefix sum we get
+ *
+ * ```
+ * [1,4,9][2,6,12,21][3,8][3,9,18,30,45]
+ * ```
+ *
+ * and for exclusive segmented prefix sum it is
+ *
+ * ```
+ * [0,1,4][0,2,6,12][0,3][0,3,9,18,30]
+ * ```
+ *
+ * In addition to common scan, we need to encode the segments of the input sequence.
+ * It is done by auxiliary flags array (it can be array of booleans) having `1` at the
+ * beginning of each segment and `0` on all other positions. In our example, it would be like this:
+ *
+ * ```
+ * [1,0,0,1,0,0,0,1,0,1,0,0, 0, 0]
+ * [1,3,5,2,4,6,9,3,5,3,6,9,12,15]
+ *
+ * ```
+ *
+ * \tparam Device parameter says on what device the scan is going to be performed.
+ * \tparam Type parameter says if an inclusive or exclusive scan is to be computed.
+ *
+ * See \ref SegmentedScan< Devices::Host, Type > and \ref SegmentedScan< Devices::Cuda, Type >.
+ *
+ * **Note: Segmented scan is not implemented for CUDA yet.**
+ */
+template< typename Device,
+          detail::ScanType Type = detail::ScanType::Inclusive >
+struct SegmentedScan;
+
+template< detail::ScanType Type >
+struct SegmentedScan< Devices::Sequential, Type >
+{
+   /**
+    * \brief Computes segmented scan (prefix sum) sequentially.
+    *
+    * \tparam Vector type of the vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones defining the segments beginning
+    *
+    * \param v input vector, the result of scan is stored in the same vector
+    * \param flags is an array with zeros and ones defining the segments beginning
+    * \param begin the first element in the array to be scanned
+    * \param end the last element in the array to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *                 for the reduction operation, i.e. element which does not
+    *                 change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/SegmentedScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include SegmentedScanExample.out
+    */
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+   static void
+   perform( Vector& v,
+            Flags& flags,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::ValueType identity );
+};
+
+template< detail::ScanType Type >
+struct SegmentedScan< Devices::Host, Type >
+{
+   /**
+    * \brief Computes segmented scan (prefix sum) using OpenMP.
+    *
+    * \tparam Vector type of the vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones defining the segments beginning
+    *
+    * \param v input vector, the result of scan is stored in the same vector
+    * \param flags is an array with zeros and ones defining the segments beginning
+    * \param begin the first element in the array to be scanned
+    * \param end the last element in the array to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *                 for the reduction operation, i.e. element which does not
+    *                 change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/SegmentedScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include SegmentedScanExample.out
+    */
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+   static void
+   perform( Vector& v,
+            Flags& flags,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::ValueType identity );
+};
+
+template< detail::ScanType Type >
+struct SegmentedScan< Devices::Cuda, Type >
+{
+   /**
+    * \brief Computes segmented scan (prefix sum) on GPU.
+    *
+    * \tparam Vector type of the vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones defining the segments beginning
+    *
+    * \param v input vector, the result of scan is stored in the same vector
+    * \param flags is an array with zeros and ones defining the segments beginning
+    * \param begin the first element in the array to be scanned
+    * \param end the last element in the array to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+    *                 for the reduction operation, i.e. element which does not
+    *                 change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/SegmentedScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include SegmentedScanExample.out
+    *
+    * **Note: Segmented scan is not implemented for CUDA yet.**
+    */
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+   static void
+   perform( Vector& v,
+            Flags& flags,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::ValueType identity );
+};
+
+} // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/SegmentedScan.hpp>
diff --git a/src/TNL/Algorithms/SegmentedScan.hpp b/src/TNL/Algorithms/SegmentedScan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..27e5efe71099d4c088fe7811091831a142743a76
--- /dev/null
+++ b/src/TNL/Algorithms/SegmentedScan.hpp
@@ -0,0 +1,104 @@
+/***************************************************************************
+                          SegmentedScan.hpp  -  description
+                             -------------------
+    begin                : Mar 24, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include "SegmentedScan.h"
+
+#include <TNL/Exceptions/NotImplementedError.h>
+
+namespace TNL {
+namespace Algorithms {
+
+template< detail::ScanType Type >
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+void
+SegmentedScan< Devices::Sequential, Type >::
+perform( Vector& v,
+         Flags& flags,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::ValueType identity )
+{
+   using ValueType = typename Vector::ValueType;
+   using IndexType = typename Vector::IndexType;
+
+   if( Type == detail::ScanType::Inclusive )
+   {
+      for( IndexType i = begin + 1; i < end; i++ )
+         if( ! flags[ i ] )
+            v[ i ] = reduction( v[ i ], v[ i - 1 ] );
+   }
+   else // Exclusive scan
+   {
+      ValueType aux( v[ begin ] );
+      v[ begin ] = identity;
+      for( IndexType i = begin + 1; i < end; i++ )
+      {
+         ValueType x = v[ i ];
+         if( flags[ i ] )
+            aux = identity;
+         v[ i ] = aux;
+         aux = reduction( aux, x );
+      }
+   }
+}
+
+template< detail::ScanType Type >
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+void
+SegmentedScan< Devices::Host, Type >::
+perform( Vector& v,
+         Flags& flags,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::ValueType identity )
+{
+#ifdef HAVE_OPENMP
+   // TODO: parallelize with OpenMP
+   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
+#else
+   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, identity );
+#endif
+}
+
+template< detail::ScanType Type >
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+void
+SegmentedScan< Devices::Cuda, Type >::
+perform( Vector& v,
+         Flags& flags,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::ValueType identity )
+{
+#ifdef HAVE_CUDA
+   using ValueType = typename Vector::ValueType;
+   using IndexType = typename Vector::IndexType;
+
+   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/BiEllpack.hpp b/src/TNL/Algorithms/Segments/BiEllpack.hpp
index d0847b6a3db19e13858af4a668b951360d3b50df..53a3eb905c16f994bf6b3ced08df2ea48e127804 100644
--- a/src/TNL/Algorithms/Segments/BiEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpack.hpp
@@ -13,6 +13,7 @@
 #include <math.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Algorithms/Segments/BiEllpack.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 
@@ -300,7 +301,7 @@ verifyRowLengths( const SizesHolder& segmentsSizes )
       const IndexType begin = this->groupPointers.getElement( groupBegin ) * getWarpSize() + rowStripPerm * stripLength;
       IndexType elementPtr = begin;
       IndexType rowLength = 0;
-      const IndexType groupsCount = details::BiEllpack< Index, Device, Organization, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
+      const IndexType groupsCount = detail::BiEllpack< Index, Device, Organization, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
       for( IndexType group = 0; group < groupsCount; group++ )
       {
          std::cerr << "groupIdx = " << group << " groupLength = " << this->getGroupLength( strip, group ) << std::endl;
@@ -345,7 +346,7 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
       this->performRowBubbleSort( segmentsSizes );
       this->computeColumnSizes( segmentsSizes );
 
-      this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();
+      inplaceExclusiveScan( this->groupPointers );
 
       this->verifyRowPerm( segmentsSizes );
       //this->verifyRowLengths( segmentsSizes ); // TODO: I am not sure what this test is doing.
@@ -385,7 +386,7 @@ template< typename Device,
 auto BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::BiEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+   return detail::BiEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
       rowPermArray.getConstView(),
       groupPointers.getConstView(),
       segmentIdx );
@@ -421,7 +422,7 @@ template< typename Device,
 __cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType
 {
-      return details::BiEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
+      return detail::BiEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
          rowPermArray.getConstView(),
          groupPointers.getConstView(),
          segmentIdx,
@@ -587,7 +588,7 @@ template< typename Device,
 auto BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >::
 getStripLength( const IndexType stripIdx ) const -> IndexType
 {
-   return details::BiEllpack< Index, Device, Organization, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx );
+   return detail::BiEllpack< Index, Device, Organization, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.h b/src/TNL/Algorithms/Segments/BiEllpackView.h
index 44629ea719e9882de532b6bdc8d66cd1ae9d2435..50f69e3aa2266b65df3bc6f089e2f05477ea0ae3 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.h
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.h
@@ -15,7 +15,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/BiEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/BiEllpack.h>
+#include <TNL/Algorithms/Segments/detail/BiEllpack.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -207,7 +207,7 @@ class BiEllpackView
                                              Args_... args );
 
       template< typename Index_, typename Fetch_, int BlockDim_, int WarpSize_, bool B_ >
-      friend struct details::BiEllpackSegmentsReductionDispatcher;
+      friend struct detail::BiEllpackSegmentsReductionDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/BiEllpackView.hpp b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
index e861e8f76605ffc73679c2ef05c1a126686b23a1..03131a0de193e9a926b11ace51cb9bbdd8a97e52 100644
--- a/src/TNL/Algorithms/Segments/BiEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/BiEllpackView.hpp
@@ -13,8 +13,9 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/BiEllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-//#include <TNL/Algorithms/Segments/details/BiEllpack.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+//#include <TNL/Algorithms/Segments/detail/BiEllpack.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -157,19 +158,19 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSize(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSize(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #endif
    }
    else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentSizeDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
@@ -205,13 +206,13 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
          rowPermArray,
          groupPointers,
          segmentIdx,
          localIdx );
 #else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndex(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndex(
          rowPermArray,
          groupPointers,
          segmentIdx,
@@ -219,7 +220,7 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
 #endif
    }
    else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getGlobalIndexDirect(
          rowPermArray,
          groupPointers,
          segmentIdx,
@@ -238,19 +239,19 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentView(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentView(
          rowPermArray,
          groupPointers,
          segmentIdx );
 #endif
    }
    else
-      return details::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
+      return detail::BiEllpack< IndexType, DeviceType, Organization, WarpSize >::getSegmentViewDirect(
          rowPermArray,
          groupPointers,
          segmentIdx );
@@ -271,7 +272,7 @@ forElements( IndexType first, IndexType last, Function&& f ) const
       const IndexType strip = segmentIdx / getWarpSize();
       const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
       const IndexType rowStripPerm = segmentsPermutationView[ segmentIdx ] - strip * getWarpSize();
-      const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
+      const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( segmentsPermutationView, segmentIdx );
       IndexType groupHeight = getWarpSize();
       //printf( "segmentIdx = %d strip = %d firstGroupInStrip = %d rowStripPerm = %d groupsCount = %d \n", segmentIdx, strip, firstGroupInStrip, rowStripPerm, groupsCount );
       bool compute( true );
@@ -356,7 +357,7 @@ void
 BiEllpackView< Device, Index, Organization, WarpSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( this->getStorageSize() == 0 )
       return;
    if( std::is_same< DeviceType, Devices::Host >::value )
@@ -365,7 +366,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          const IndexType stripIdx = segmentIdx / getWarpSize();
          const IndexType groupIdx = stripIdx * ( getLogWarpSize() + 1 );
          const IndexType inStripIdx = rowPermArray[ segmentIdx ] - stripIdx * getWarpSize();
-         const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCount( rowPermArray, segmentIdx );
+         const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCount( rowPermArray, segmentIdx );
          IndexType globalIdx = groupPointers[ groupIdx ];
          IndexType groupHeight = getWarpSize();
          IndexType localIdx( 0 );
@@ -379,7 +380,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          //          << std::endl;
          for( IndexType group = 0; group < groupsCount && compute; group++ )
          {
-            const IndexType groupSize = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group );
+            const IndexType groupSize = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getGroupSize( groupPointers, stripIdx, group );
             IndexType groupWidth = groupSize / groupHeight;
             const IndexType globalIdxBack = globalIdx;
             //std::cerr << "  groupSize = " << groupSize
@@ -391,11 +392,11 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
                globalIdx += inStripIdx;
             for( IndexType j = 0; j < groupWidth && compute; j++ )
             {
-               //std::cerr << "    segmentIdx = " << segmentIdx << " groupIdx = " << groupIdx 
+               //std::cerr << "    segmentIdx = " << segmentIdx << " groupIdx = " << groupIdx
                //         << " groupWidth = " << groupWidth << " groupHeight = " << groupHeight
-               //          << " localIdx = " << localIdx << " globalIdx = " << globalIdx 
-               //          << " fetch = " << details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) << std::endl;
-               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               //          << " localIdx = " << localIdx << " globalIdx = " << globalIdx
+               //          << " fetch = " << detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) << std::endl;
+               aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
                if( Organization == RowMajorOrder )
                   globalIdx ++;
                else
@@ -424,7 +425,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          dim3 cudaGridSize = Cuda::getMaxGridSize();
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
+         detail::BiEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, BlockDim, Args...  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
          cudaThreadSynchronize();
@@ -496,7 +497,7 @@ printStructure( std::ostream& str ) const
       {
          const IndexType groupSize = groupPointers.getElement( groupIdx + 1 ) - groupPointers.getElement( groupIdx );
          const IndexType groupWidth = groupSize / groupHeight;
-         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight 
+         str << "\tGroup: " << groupIdx << " size = " << groupSize << " width = " << groupWidth << " height = " << groupHeight
              << " offset = " << groupPointers.getElement( groupIdx ) << std::endl;
          groupHeight /= 2;
       }
@@ -534,7 +535,7 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
    const IndexType strip = segmentIdx / getWarpSize();
    const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 );
    const IndexType rowStripPerm = rowPermArray[ segmentIdx ] - strip * getWarpSize();
-   const IndexType groupsCount = details::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
+   const IndexType groupsCount = detail::BiEllpack< IndexType, DeviceType, Organization, getWarpSize() >::getActiveGroupsCountDirect( rowPermArray, segmentIdx );
    IndexType groupHeight = getWarpSize();
    bool compute( true );
    IndexType localIdx( 0 );
@@ -607,13 +608,13 @@ segmentsReductionKernel( IndexType gridIdx,
 
    /////
    // Fetch group pointers to shared memory
-   //bool b1 = ( threadIdx.x <= warpsCount * groupsInStrip ); 
+   //bool b1 = ( threadIdx.x <= warpsCount * groupsInStrip );
    //bool b2 = ( firstGroupIdx + threadIdx.x % groupsInStrip < this->groupPointers.getSize() );
    //printf( "tid = %d warpsCount * groupsInStrip = %d firstGroupIdx + threadIdx.x = %d this->groupPointers.getSize() = %d read = %d %d\n",
    //   threadIdx.x, warpsCount * groupsInStrip,
    //   firstGroupIdx + threadIdx.x,
    //   this->groupPointers.getSize(), ( int ) b1, ( int ) b2 );
-   if( threadIdx.x <= warpsCount * groupsInStrip && 
+   if( threadIdx.x <= warpsCount * groupsInStrip &&
       firstGroupInBlock + threadIdx.x < this->groupPointers.getSize() )
    {
       sharedGroupPointers[ threadIdx.x ] = this->groupPointers[ firstGroupInBlock + threadIdx.x ];
@@ -634,7 +635,7 @@ segmentsReductionKernel( IndexType gridIdx,
          IndexType groupEnd = sharedGroupPointers[ sharedGroupOffset + group + 1 ];
          TNL_ASSERT_LT( groupBegin, this->getStorageSize(), "" );
          //if( groupBegin >= this->getStorageSize() )
-         //   printf( "tid = %d sharedGroupOffset + group + 1 = %d strip = %d group = %d groupBegin = %d groupEnd = %d this->getStorageSize() = %d\n", 
+         //   printf( "tid = %d sharedGroupOffset + group + 1 = %d strip = %d group = %d groupBegin = %d groupEnd = %d this->getStorageSize() = %d\n",
          //      threadIdx.x, sharedGroupOffset + group + 1, strip, group, groupBegin, groupEnd, this->getStorageSize() );
          TNL_ASSERT_LT( groupEnd, this->getStorageSize(), "" );
          if( groupEnd - groupBegin > 0 )
@@ -675,7 +676,7 @@ segmentsReductionKernel( IndexType gridIdx,
             {
                temp[ threadIdx.x ] = reduction( temp[ threadIdx.x ], fetch( globalIdx, compute ) );
                //if( strip == 1 )
-               //   printf( "tid %d fetch %f temp %f \n", threadIdx.x, fetch( globalIdx, compute ), temp[ threadIdx.x ] );               
+               //   printf( "tid %d fetch %f temp %f \n", threadIdx.x, fetch( globalIdx, compute ), temp[ threadIdx.x ] );
                globalIdx += getWarpSize();
             }
             // TODO: reduction via templates
diff --git a/src/TNL/Algorithms/Segments/CSR.hpp b/src/TNL/Algorithms/Segments/CSR.hpp
index 823393c2f2e1cb7bcbed638701157cca665df988..44f9aa799cb3ce29a6f2b35ef8b78b9030b663ba 100644
--- a/src/TNL/Algorithms/Segments/CSR.hpp
+++ b/src/TNL/Algorithms/Segments/CSR.hpp
@@ -13,7 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSR.h>
-#include <TNL/Algorithms/Segments/details/CSR.h>
+#include <TNL/Algorithms/Segments/detail/CSR.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -91,7 +91,7 @@ void
 CSR< Device, Index, Kernel, IndexAllocator >::
 setSegmentsSizes( const SizesHolder& sizes )
 {
-   details::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
+   detail::CSR< Device, Index >::setSegmentsSizes( sizes, this->offsets );
    this->kernel.init( this->offsets );
 }
 
@@ -148,7 +148,7 @@ template< typename Device,
 __cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
+   return detail::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
 
 template< typename Device,
@@ -168,7 +168,7 @@ template< typename Device,
 __cuda_callable__ auto CSR< Device, Index, Kernel, IndexAllocator >::
 getStorageSize() const -> IndexType
 {
-   return details::CSR< Device, Index >::getStorageSize( this->offsets );
+   return detail::CSR< Device, Index >::getStorageSize( this->offsets );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
index 58710a88355f8db94d6bd80d5c22985a76646164..640120f86cba0b515829587954df770cc7d0c01c 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h
@@ -14,10 +14,10 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -65,7 +65,7 @@ struct CSRAdaptiveKernel
 
    static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };
 
-   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
+   static int getSizeValueLog( const int& i ) { return detail::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
 
    static TNL::String getKernelType();
 
@@ -98,7 +98,7 @@ struct CSRAdaptiveKernel
       Index findLimit( const Index start,
                      const Offsets& offsets,
                      const Index size,
-                     details::Type &type,
+                     detail::Type &type,
                      size_t &sum );
 
       template< int SizeOfValue,
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
index d0217b57b546ad082dca96550258f8611cc04333..a510ac395687bcb5057a6c397fe6e5031a9f5c58 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp
@@ -14,9 +14,9 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -121,7 +121,7 @@ CSRAdaptiveKernel< Index, Device >::
 findLimit( const Index start,
            const Offsets& offsets,
            const Index size,
-           details::Type &type,
+           detail::Type &type,
            size_t &sum )
 {
    sum = 0;
@@ -129,24 +129,24 @@ findLimit( const Index start,
    {
       Index elements = offsets[ current + 1 ] - offsets[ current ];
       sum += elements;
-      if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() )
+      if( sum > detail::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() )
       {
          if( current - start > 0 ) // extra row
          {
-            type = details::Type::STREAM;
+            type = detail::Type::STREAM;
             return current;
          }
          else
          {                  // one long row
-            if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT )
-               type = details::Type::VECTOR;
+            if( sum <= 2 * detail::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT )
+               type = detail::Type::VECTOR;
             else
-               type = details::Type::LONG;
+               type = detail::Type::LONG;
             return current + 1;
          }
       }
    }
-   type = details::Type::STREAM;
+   type = detail::Type::STREAM;
    return size - 1; // return last row pointer
 }
 
@@ -165,22 +165,22 @@ initValueSize( const Offsets& offsets )
    size_t sum;
 
    // Fill blocks
-   std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
+   std::vector< detail::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
    inBlocks.reserve( rows );
 
    while( nextStart != rows - 1 )
    {
-      details::Type type;
+      detail::Type type;
       nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum );
-      if( type == details::Type::LONG )
+      if( type == detail::Type::LONG )
       {
          const Index blocksCount = inBlocks.size();
-         const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize();
+         const Index warpsPerCudaBlock = detail::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize();
          Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
          if( warpsLeft == 0 )
             warpsLeft = warpsPerCudaBlock;
          for( Index index = 0; index < warpsLeft; index++ )
-            inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
+            inBlocks.emplace_back( start, detail::Type::LONG, index, warpsLeft );
       }
       else
       {
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
index b81d360278b06219c8b3cf3ee18bdf09e8623406..9de407051b52609f124ffb5d07a6c0a4a364ea79 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h
@@ -11,8 +11,8 @@
 #pragma once
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -26,12 +26,12 @@ struct CSRAdaptiveKernelView
    using DeviceType = Device;
    using ViewType = CSRAdaptiveKernelView< Index, Device >;
    using ConstViewType = CSRAdaptiveKernelView< Index, Device >;
-   using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
+   using BlocksType = TNL::Containers::Vector< detail::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
    using BlocksView = typename BlocksType::ViewType;
 
-   static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog;
+   static constexpr int MaxValueSizeLog = detail::CSRAdaptiveKernelParameters<>::MaxValueSizeLog;
 
-   static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
+   static int getSizeValueLog( const int& i ) { return detail::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };
 
    CSRAdaptiveKernelView() = default;
 
diff --git a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
index 979a545240586c097936859ba2ac7a99c6efba0f..4f15608579de2c4d831111c4b0283a5a349b465e 100644
--- a/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp
@@ -14,11 +14,11 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
 #include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
-#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h>
+#include <TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -46,11 +46,11 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
                                     Real zero,
                                     Args... args )
 {
-   using BlockType = details::CSRAdaptiveKernelBlockDescriptor< Index >;
-   constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+   using BlockType = detail::CSRAdaptiveKernelBlockDescriptor< Index >;
+   constexpr int CudaBlockSize = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
    constexpr int WarpSize = Cuda::getWarpSize();
-   constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
-   constexpr size_t StreamedSharedElementsPerWarp  = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
+   constexpr int WarpsCount = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
+   constexpr size_t StreamedSharedElementsPerWarp  = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();
 
    __shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
    __shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
@@ -74,7 +74,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
    const Index firstSegmentIdx = block.getFirstSegment();
    const Index begin = offsets[ firstSegmentIdx ];
 
-   if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
+   if( block.getType() == detail::Type::STREAM ) // Stream kernel - many short segments per warp
    {
       const Index warpIdx = threadIdx.x / 32;
       const Index end = begin + block.getSize();
@@ -94,7 +94,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
          keep( i, result );
       }
    }
-   else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
+   else if( block.getType() == detail::Type::VECTOR ) // Vector kernel - one segment per warp
    {
       const Index end = begin + block.getSize();
       const Index segmentIdx = block.getFirstSegment();
@@ -181,7 +181,7 @@ template< typename Index,
           typename Reduction,
           typename ResultKeeper,
           bool DispatchScalarCSR =
-            details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
+            detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() ||
             std::is_same< Device, Devices::Host >::value >
 struct CSRAdaptiveKernelSegmentsReductionDispatcher;
 
@@ -237,7 +237,7 @@ struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduc
 
       Index blocksCount;
 
-      const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
+      const Index threads = detail::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
       constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();
 
       // Fill blocks
@@ -333,7 +333,7 @@ segmentsReduction( const OffsetsView& offsets,
 {
    int valueSizeLog = getSizeValueLog( sizeof( Real ) );
 
-   if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
+   if( detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() || valueSizeLog >= MaxValueSizeLog )
    {
       TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
          segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.h b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
index 9a8109c9705a99d0e04e2a3fe13c25869c037905..d3e48be1eeb7ab0fe386de91f7541292329cb406 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.h
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
index b4cc24a7355c474144300bbd41b76bab42d9ce2a..90505358e7246f350157d4ee90dfebd4b470c432 100644
--- a/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRHybridKernel.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRHybridKernel.h>
 
 namespace TNL {
@@ -57,7 +57,7 @@ void segmentsReductionCSRHybridKernel(
     bool compute( true );
     for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += ThreadsPerSegment )
     {
-      aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+      aux = reduce( aux, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
       localIdx += TNL::Cuda::getWarpSize();
     }
 
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.h b/src/TNL/Algorithms/Segments/CSRScalarKernel.h
index 8a56d75d1b38a3e224176925ce5017f9a53d2e1a..c767083193c59abc770f38a8bb52abb3c4ac06a0 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRScalarKernel.h
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp b/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
index 15f69667971ff2e404196babc957d0feb597a623..dd05fee201cd6360c65b1fd4311c5f9616a88c6f 100644
--- a/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRScalarKernel.hpp
@@ -15,7 +15,7 @@
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSRScalarKernel.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -91,7 +91,7 @@ segmentsReduction( const OffsetsView& offsets,
         IndexType localIdx( 0 );
         bool compute( true );
         for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
         keeper( segmentIdx, aux );
     };
 
@@ -109,7 +109,7 @@ segmentsReduction( const OffsetsView& offsets,
             IndexType localIdx( 0 );
             bool compute( true );
             for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++  )
-                aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+                aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             keeper( segmentIdx, aux );
         }*/
     }
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.h b/src/TNL/Algorithms/Segments/CSRVectorKernel.h
index 3163abb6029a116f627705cba86fff2593c6a1fe..074f15c5a35c8c096c52134bc7f6fbb5dd536bec 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.h
+++ b/src/TNL/Algorithms/Segments/CSRVectorKernel.h
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
diff --git a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp b/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
index 2caf272c14fdd860e6e2e70288d77d88f39b46e8..847d1c355f1d775259b3291744bcae287144d7e4 100644
--- a/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
+++ b/src/TNL/Algorithms/Segments/CSRVectorKernel.hpp
@@ -14,7 +14,7 @@
 #include <TNL/Cuda/LaunchHelpers.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 #include <TNL/Algorithms/Segments/CSRVectorKernel.h>
 
 namespace TNL {
@@ -58,7 +58,7 @@ void segmentsReductionCSRKernelVector(
     for( Index globalIdx = offsets[ segmentIdx ] + localIdx; globalIdx < endIdx; globalIdx += TNL::Cuda::getWarpSize() )
     {
         TNL_ASSERT_LT( globalIdx, endIdx, "" );
-        aux = reduce( aux, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
+        aux = reduce( aux, detail::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, localIdx, globalIdx, compute ) );
         localIdx += TNL::Cuda::getWarpSize();
     }
 
diff --git a/src/TNL/Algorithms/Segments/CSRView.hpp b/src/TNL/Algorithms/Segments/CSRView.hpp
index bb40dc9f612282b22a4b374ae2ab936a62588f9e..8c9f1e78944698cc7bc4e74bb123915f0ac41730 100644
--- a/src/TNL/Algorithms/Segments/CSRView.hpp
+++ b/src/TNL/Algorithms/Segments/CSRView.hpp
@@ -13,8 +13,8 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/CSRView.h>
-#include <TNL/Algorithms/Segments/details/CSR.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/CSR.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -131,7 +131,7 @@ template< typename Device,
 __cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
+   return detail::CSR< Device, Index >::getSegmentSize( this->offsets, segmentIdx );
 }
 
 template< typename Device,
@@ -149,7 +149,7 @@ template< typename Device,
 __cuda_callable__ auto CSRView< Device, Index, Kernel >::
 getStorageSize() const -> IndexType
 {
-   return details::CSR< Device, Index >::getStorageSize( this->offsets );
+   return detail::CSR< Device, Index >::getStorageSize( this->offsets );
 }
 
 template< typename Device,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
index b6bdd5bf16d13c94137dbb161e7eb13877e91a31..5abb93b5a0ad2dd6027c46940c4b73cea6b0a227 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.h
@@ -36,7 +36,7 @@ class ChunkedEllpack
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, Organization >;
       using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< IndexType >, Organization >;
       using SegmentViewType = typename ViewType::SegmentViewType;
-      using ChunkedEllpackSliceInfoType = typename ViewType::ChunkedEllpackSliceInfoType; // details::ChunkedEllpackSliceInfo< IndexType >;
+      using ChunkedEllpackSliceInfoType = typename ViewType::ChunkedEllpackSliceInfoType; // detail::ChunkedEllpackSliceInfo< IndexType >;
       //TODO: using ChunkedEllpackSliceInfoAllocator = typename IndexAllocatorType::retype< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoAllocator = typename ViewType::ChunkedEllpackSliceInfoAllocator; // typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = typename ViewType::ChunkedEllpackSliceInfoContainer; // Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
index 69e6b4c67f648ffbe1146dc38463682fd3c4079d..b4f60047bb128069d7c477ec43217cb6d2c8cc95 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpack.hpp
@@ -12,6 +12,7 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpack.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 
@@ -37,7 +38,7 @@ ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 ChunkedEllpack( const ChunkedEllpack& chunkedEllpack )
    : size( chunkedEllpack.size ),
      storageSize( chunkedEllpack.storageSize ),
-     chunksInSlice( chunkedEllpack.chunksInSlice ), 
+     chunksInSlice( chunkedEllpack.chunksInSlice ),
      desiredChunkSize( chunkedEllpack.desiredChunkSize ),
      rowToChunkMapping( chunkedEllpack.rowToChunkMapping ),
      rowToSliceMapping( chunkedEllpack.rowToSliceMapping ),
@@ -273,7 +274,7 @@ setSegmentsSizes( const SizesHolder& segmentsSizes )
       this->storageSize = 0;
       for( IndexType sliceIndex = 0; sliceIndex < numberOfSlices; sliceIndex++ )
          this->setSlice( segmentsSizes, sliceIndex, storageSize );
-      this->rowPointers.scan();
+      inplaceInclusiveScan( this->rowPointers );
       IndexType chunksCount = this->numberOfSlices * this->chunksInSlice;
       this->chunksToSegmentsMapping.setSize( chunksCount );
       IndexType chunkIdx( 0 );
@@ -335,7 +336,7 @@ template< typename Device,
 auto ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
-   return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+   return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
       rowToSliceMapping.getView(),
       slices.getView(),
       rowToChunkMapping.getView(),
@@ -369,7 +370,7 @@ template< typename Device,
 __cuda_callable__ auto ChunkedEllpack< Device, Index, IndexAllocator, Organization >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
index 196c0764e1109ea66e433443c068553eed495dc7..f7211c21625fc147eb6db63a44298110b633e3aa 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.h
@@ -16,7 +16,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ElementsOrganization.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/detail/ChunkedEllpack.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -39,7 +39,7 @@ class ChunkedEllpackView
       using ViewTemplate = ChunkedEllpackView< Device_, Index_, Organization >;
       using ConstViewType = ChunkedEllpackView< Device, std::add_const_t< Index >, Organization >;
       using SegmentViewType = ChunkedEllpackSegmentView< IndexType, Organization >;
-      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
+      using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
@@ -230,7 +230,7 @@ class ChunkedEllpackView
                                                   Args_... args );
 
       template< typename Index_, typename Fetch_, bool B_ >
-      friend struct details::ChunkedEllpackSegmentsReductionDispatcher;
+      friend struct detail::ChunkedEllpackSegmentsReductionDispatcher;
 #endif
 };
       } // namespace Segments
diff --git a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
index 147b362d125bdad627bd58d728ab0a1573be9345..26e8fd0f75d64d2c70619c8ef79fe2a16bafcc14 100644
--- a/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/ChunkedEllpackView.hpp
@@ -13,8 +13,9 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
-//#include <TNL/Algorithms/Segments/details/ChunkedEllpack.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
+//#include <TNL/Algorithms/Segments/detail/ChunkedEllpack.h>
+#include <TNL/Cuda/SharedMemory.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -183,7 +184,7 @@ __cuda_callable__ auto ChunkedEllpackView< Device, Index, Organization >::
 getSegmentSize( const IndexType segmentIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -191,13 +192,13 @@ getSegmentSize( const IndexType segmentIdx ) const -> IndexType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSizeDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
          segmentIdx );
 #else
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentSize(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -231,7 +232,7 @@ __cuda_callable__ auto ChunkedEllpackView< Device, Index, Organization >::
 getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -241,7 +242,7 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndexDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -249,7 +250,7 @@ getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexTyp
          segmentIdx,
          localIdx );
 #else
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getGlobalIndex(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -269,7 +270,7 @@ ChunkedEllpackView< Device, Index, Organization >::
 getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
 {
    if( std::is_same< DeviceType, Devices::Host >::value )
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -278,14 +279,14 @@ getSegmentView( const IndexType segmentIdx ) const -> SegmentViewType
    if( std::is_same< DeviceType, Devices::Cuda >::value )
    {
 #ifdef __CUDA_ARCH__
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentViewDirect(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
          chunksInSlice,
          segmentIdx );
 #else
-      return details::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentView(
+      return detail::ChunkedEllpack< IndexType, DeviceType, Organization >::getSegmentView(
          rowToSliceMapping,
          slices,
          rowToChunkMapping,
@@ -397,7 +398,7 @@ void
 ChunkedEllpackView< Device, Index, Organization >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( std::is_same< DeviceType, Devices::Host >::value )
    {
       //segmentsReductionKernel( 0, first, last, fetch, reduction, keeper, zero, args... );
@@ -425,7 +426,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
             IndexType end = begin + segmentSize;
             IndexType localIdx( 0 );
             for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx++ )
-               aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+               aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          }
          else
          {
@@ -435,7 +436,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
                IndexType end = begin + chunksInSlice * chunkSize;
                IndexType localIdx( 0 );
                for( IndexType globalIdx = begin; globalIdx < end && compute; globalIdx += chunksInSlice )
-                  aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+                  aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
             }
          }
          keeper( segmentIdx, aux );
@@ -455,7 +456,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
       {
          if( gridIdx == cudaGrids - 1 )
             cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
-         details::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
+         detail::ChunkedEllpackSegmentsReductionKernel< ViewType, IndexType, Fetch, Reduction, ResultKeeper, Real, Args...  >
             <<< cudaGridSize, cudaBlockSize, sharedMemory  >>>
             ( *this, gridIdx, first, last, fetch, reduction, keeper, zero, args... );
       }
@@ -566,7 +567,7 @@ segmentsReductionKernelWithAllParameters( IndexType gridIdx,
       return;
 
    RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
+   __shared__ detail::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
    if( threadIdx.x == 0 )
       sliceInfo = this->slices[ sliceIdx ];
    chunksResults[ threadIdx.x ] = zero;
@@ -644,7 +645,7 @@ segmentsReductionKernel( IndexType gridIdx,
       return;
 
    RealType* chunksResults = Cuda::getSharedMemory< RealType >();
-   __shared__ details::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
+   __shared__ detail::ChunkedEllpackSliceInfo< IndexType > sliceInfo;
 
    if( threadIdx.x == 0 )
       sliceInfo = this->slices[ sliceIdx ];
diff --git a/src/TNL/Algorithms/Segments/EllpackView.hpp b/src/TNL/Algorithms/Segments/EllpackView.hpp
index 724774b539d9c2d431bb601a1efcb2639dd8d5f9..6215f4ef971be08c63c23af0e985dd368a7d3e6f 100644
--- a/src/TNL/Algorithms/Segments/EllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/EllpackView.hpp
@@ -13,7 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/EllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -276,7 +276,7 @@ void EllpackView< Device, Index, Organization, Alignment >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    if( Organization == RowMajorOrder )
    {
       const IndexType segmentSize = this->segmentSize;
@@ -287,7 +287,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j++  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -303,7 +303,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType j = begin; j < end && compute; j += alignedSize  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, j, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
index 82e7a85711a192024b320312066200ea825c6662..4482cd567b704fcc5fdde99a33aa14be0491dabe 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpack.hpp
@@ -12,6 +12,7 @@
 
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 
@@ -152,7 +153,7 @@ setSegmentsSizes( const SizesHolder& sizes )
       slice_segment_size_view[ i ] = res;
    };
    ellpack.allReduction( fetch, reduce, keep, std::numeric_limits< IndexType >::min() );
-   this->sliceOffsets.template scan< Algorithms::ScanType::Exclusive >();
+   inplaceExclusiveScan( this->sliceOffsets );
    this->size = sum( sizes );
    this->alignedSize = this->sliceOffsets.getElement( slicesCount );
 }
diff --git a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
index 42fdae7ea488ddd426d6f8d084b924066f9aefd2..94bebca13412a0d16d7f4548495c8b075f93e51c 100644
--- a/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
+++ b/src/TNL/Algorithms/Segments/SlicedEllpackView.hpp
@@ -13,7 +13,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/ParallelFor.h>
 #include <TNL/Algorithms/Segments/SlicedEllpackView.h>
-#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
+#include <TNL/Algorithms/Segments/detail/LambdaAdapter.h>
 
 #include "SlicedEllpackView.h"
 
@@ -331,7 +331,7 @@ void
 SlicedEllpackView< Device, Index, Organization, SliceSize >::
 segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
 {
-   using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;
+   using RealType = typename detail::FetchLambdaAdapter< Index, Fetch >::ReturnType;
    //using RealType = decltype( fetch( IndexType(), IndexType(), IndexType(), std::declval< bool& >(), args... ) );
    const auto sliceSegmentSizes_view = this->sliceSegmentSizes.getConstView();
    const auto sliceOffsets_view = this->sliceOffsets.getConstView();
@@ -347,7 +347,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx< end; globalIdx++  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
@@ -364,7 +364,7 @@ segmentsReduction( IndexType first, IndexType last, Fetch& fetch, const Reductio
          IndexType localIdx( 0 );
          bool compute( true );
          for( IndexType globalIdx = begin; globalIdx < end; globalIdx += SliceSize  )
-            aux = reduction( aux, details::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
+            aux = reduction( aux, detail::FetchLambdaAdapter< IndexType, Fetch >::call( fetch, segmentIdx, localIdx++, globalIdx, compute ) );
          keeper( segmentIdx, aux );
       };
       Algorithms::ParallelFor< Device >::exec( first, last, l, args... );
diff --git a/src/TNL/Algorithms/Segments/details/BiEllpack.h b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
similarity index 98%
rename from src/TNL/Algorithms/Segments/details/BiEllpack.h
rename to src/TNL/Algorithms/Segments/detail/BiEllpack.h
index 29551eb1deb9314a13aa50749174b18093401140..a45e16d779273fb63aed38ddecedce5c2989a657 100644
--- a/src/TNL/Algorithms/Segments/details/BiEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/BiEllpack.h
@@ -13,12 +13,12 @@
 #include <type_traits>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/BiEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/CheckLambdas.h>
+#include <TNL/Algorithms/Segments/detail/CheckLambdas.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Index,
           typename Device,
@@ -292,7 +292,7 @@ template< typename Index,
           typename Fetch,
           int BlockDim = 256,
           int WarpSize = 32,
-          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct BiEllpackSegmentsReductionDispatcher{};
 
 template< typename Index, typename Fetch, int BlockDim, int WarpSize >
@@ -364,7 +364,7 @@ void BiEllpackSegmentsReductionKernel( View biEllpack,
 }
 #endif
 
-         } //namespace details
+         } //namespace detail
       } //namespace Segments
    } //namespace Algorithms
 } //namepsace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CSR.h b/src/TNL/Algorithms/Segments/detail/CSR.h
similarity index 96%
rename from src/TNL/Algorithms/Segments/details/CSR.h
rename to src/TNL/Algorithms/Segments/detail/CSR.h
index b9392815db770e502f29dcc4ea9a6f07a8b269eb..e43a97b671757586449ee76b811872e142209252 100644
--- a/src/TNL/Algorithms/Segments/details/CSR.h
+++ b/src/TNL/Algorithms/Segments/detail/CSR.h
@@ -10,11 +10,12 @@
 
 #pragma once
 
+#include <TNL/Algorithms/scan.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Device,
           typename Index >
@@ -35,7 +36,7 @@ class CSR
             view = sizes;
          }
          offsets.setElement( sizes.getSize(), 0 );
-         offsets.template scan< Algorithms::ScanType::Exclusive >();
+         inplaceExclusiveScan( offsets );
       }
 
       template< typename CSROffsets >
@@ -109,7 +110,7 @@ class CSR
       template< typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args >
       void allReduction( Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const;
 };
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h
similarity index 99%
rename from src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
rename to src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h
index d2be8966453c9d1253720925cfea44545bfbbb96..83faa105d198be0a0d5e97cd3d550a085cce2818 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h
+++ b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelBlockDescriptor.h
@@ -13,7 +13,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 enum class Type {
    /* LONG = 0!!! Non zero value rewrites index[1] */
@@ -245,7 +245,7 @@ std::ostream& operator<< ( std::ostream& str, const CSRAdaptiveKernelBlockDescri
    block.print( str );
    return str;
 }
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
similarity index 98%
rename from src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
rename to src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
index 843f2f7d52d56e9aab89fcc63b06b1b1f936384b..f11668c2d348371917e04518124f2bea846f8fae 100644
--- a/src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h
+++ b/src/TNL/Algorithms/Segments/detail/CSRAdaptiveKernelParameters.h
@@ -13,7 +13,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 // This can be used for tunning the number of CUDA threads per block depending on the size of Value
 // TODO: Perform some tests
@@ -106,7 +106,7 @@ getSizeValueLogConstexpr( const int i )
    return 6;
 };
 
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/CheckLambdas.h b/src/TNL/Algorithms/Segments/detail/CheckLambdas.h
similarity index 94%
rename from src/TNL/Algorithms/Segments/details/CheckLambdas.h
rename to src/TNL/Algorithms/Segments/detail/CheckLambdas.h
index a9b6d672b63503cdaca841189bbe95d3f5de0a8d..11944f9481fa47e02f00a01639ba8a7230f1e8d6 100644
--- a/src/TNL/Algorithms/Segments/details/CheckLambdas.h
+++ b/src/TNL/Algorithms/Segments/detail/CheckLambdas.h
@@ -14,7 +14,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Index,
           typename Lambda >
@@ -34,7 +34,7 @@ class CheckFetchLambda
       static constexpr bool hasAllParameters() { return value; };
 };
 
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Segments/details/ChunkedEllpack.h b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
similarity index 97%
rename from src/TNL/Algorithms/Segments/details/ChunkedEllpack.h
rename to src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
index 41e4ca4158d45ed3945360aaf55e35adacd65b72..5f47b0cafc9ac77e4b5d7257b74ad05b040e3651 100644
--- a/src/TNL/Algorithms/Segments/details/ChunkedEllpack.h
+++ b/src/TNL/Algorithms/Segments/detail/ChunkedEllpack.h
@@ -13,12 +13,12 @@
 #include <type_traits>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Segments/ChunkedEllpackSegmentView.h>
-#include <TNL/Algorithms/Segments/details/CheckLambdas.h>
+#include <TNL/Algorithms/Segments/detail/CheckLambdas.h>
 
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 /***
  * In the ChunkedEllpack, the segments are split into slices. This is done
@@ -65,7 +65,7 @@ class ChunkedEllpack
       using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType >;
       using OffsetsHolderView = typename OffsetsHolder::ViewType;
       using SegmentsSizes = OffsetsHolder;
-      using ChunkedEllpackSliceInfoType = details::ChunkedEllpackSliceInfo< IndexType >;
+      using ChunkedEllpackSliceInfoType = detail::ChunkedEllpackSliceInfo< IndexType >;
       using ChunkedEllpackSliceInfoAllocator = typename Allocators::Default< Device >::template Allocator< ChunkedEllpackSliceInfoType >;
       using ChunkedEllpackSliceInfoContainer = Containers::Array< ChunkedEllpackSliceInfoType, DeviceType, IndexType, ChunkedEllpackSliceInfoAllocator >;
       using ChunkedEllpackSliceInfoContainerView = typename ChunkedEllpackSliceInfoContainer::ViewType;
@@ -233,7 +233,7 @@ class ChunkedEllpack
 #ifdef HAVE_CUDA
 template< typename Index,
           typename Fetch,
-          bool HasAllParameters = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
+          bool HasAllParameters = detail::CheckFetchLambda< Index, Fetch >::hasAllParameters() >
 struct ChunkedEllpackSegmentsReductionDispatcher{};
 
 template< typename Index, typename Fetch >
@@ -304,7 +304,7 @@ void ChunkedEllpackSegmentsReductionKernel( View chunkedEllpack,
 }
 #endif
 
-         } //namespace details
+         } //namespace detail
       } //namespace Segments
    } //namespace Algorithms
 } //namepsace TNL
diff --git a/src/TNL/Algorithms/Segments/details/LambdaAdapter.h b/src/TNL/Algorithms/Segments/detail/LambdaAdapter.h
similarity index 96%
rename from src/TNL/Algorithms/Segments/details/LambdaAdapter.h
rename to src/TNL/Algorithms/Segments/detail/LambdaAdapter.h
index e4d8871c5484bab3abb7c85abf29eea5946ac50c..a46acba8febec3904898a7014cc8cda634b0e31a 100644
--- a/src/TNL/Algorithms/Segments/details/LambdaAdapter.h
+++ b/src/TNL/Algorithms/Segments/detail/LambdaAdapter.h
@@ -15,7 +15,7 @@
 namespace TNL {
    namespace Algorithms {
       namespace Segments {
-         namespace details {
+         namespace detail {
 
 template< typename Index,
           typename Lambda,
@@ -50,7 +50,7 @@ struct FetchLambdaAdapter< Index, Lambda, false >
    }
 };
 
-         } // namespace details
+         } // namespace detail
       } // namespace Segments
    }  // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.h b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
index 0b97bd7c698cc79ad7733e0e3487a09e80c5d3c3..0a52b52fa772c7f4216d69fd01a2c0ad1680f352 100644
--- a/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
+++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.h
@@ -13,7 +13,6 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/Vector.h>
 #include <TNL/Algorithms/Sorting/detail/task.h>
 
 namespace TNL {
@@ -94,7 +93,7 @@ class Quicksorter< Value, Devices::Cuda >
       Containers::Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount;  //is in reality 1 integer each
 
       Containers::Array<int, Devices::Cuda> cuda_blockToTaskMapping;
-      Containers::Vector<int, Devices::Cuda> cuda_reductionTaskInitMem;
+      Containers::Array<int, Devices::Cuda> cuda_reductionTaskInitMem;
 
       int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0;
       int iteration = 0;
diff --git a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
index da3f3a4fbd002577077c9882b1f4864161ca5d67..b775c9b77074ff4b02fb395af4c8d401d0222009 100644
--- a/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
+++ b/src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp
@@ -17,7 +17,8 @@
 #include <TNL/Algorithms/Sorting/detail/quicksort_kernel.h>
 #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
 #include <TNL/Algorithms/Sorting/detail/Quicksorter.h>
-#include <TNL/Algorithms/Scan.h>
+#include <TNL/Algorithms/reduce.h>
+#include <TNL/Algorithms/scan.h>
 
 namespace TNL {
     namespace Algorithms {
@@ -314,8 +315,7 @@ int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::C
         int size = task.partitionEnd - task.partitionBegin;
         return size / elemPerBlock + (size % elemPerBlock != 0);
     };
-    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
-    return Algorithms::reduce<Devices::Cuda>( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 );
+    return reduce< Devices::Cuda >( 0, quicksort.host_1stPhaseTasksAmount, fetch, TNL::Plus{} );
 }
 
 template< typename Value >
@@ -323,14 +323,6 @@ int
 Quicksorter< Value, Devices::Cuda >::
 getSetsNeeded(int elemPerBlock) const
 {
-    /*auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
-    auto fetch = [=] __cuda_callable__(int i) {
-        const auto &task = view[i];
-        int size = task.partitionEnd - task.partitionBegin;
-        return size / elemPerBlock + (size % elemPerBlock != 0);
-    };
-    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
-    return Algorithms::reduce<Devices::Cuda>(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/
     return getSetsNeededFunction< Value >( elemPerBlock, *this );
 }
 
@@ -372,10 +364,7 @@ initTasks(int elemPerBlock, const CMP &Cmp)
                                                       cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));
     //cuda_reductionTaskInitMem[i] == how many blocks task i needs
 
-    //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; };
-
-    Algorithms::Scan<Devices::Cuda, Algorithms::ScanType::Inclusive >::
-        perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0);
+    inplaceInclusiveScan(cuda_reductionTaskInitMem);
     //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need
 
     int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);
diff --git a/src/TNL/Algorithms/Sorting/detail/cudaPartition.h b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
index a6afaa20a33a0c687a6814bcfee92546e38086b7..5277cc4d336e87d54929c92d81b31a4dea83e1d1 100644
--- a/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
+++ b/src/TNL/Algorithms/Sorting/detail/cudaPartition.h
@@ -13,8 +13,8 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Algorithms/Sorting/detail/reduction.h>
 #include <TNL/Algorithms/Sorting/detail/task.h>
+#include <TNL/Algorithms/detail/CudaScanKernel.h>
 
 namespace TNL {
    namespace Algorithms {
@@ -185,8 +185,11 @@ __device__ void cudaPartition( Containers::ArrayView<Value, Devices::Cuda> src,
     int smaller = 0, bigger = 0;
     countElem(srcView, Cmp, smaller, bigger, pivot);
 
-    int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
-    int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
+    //synchronization is in this function already
+    using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >;
+    __shared__ typename BlockScan::Storage storage;
+    int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage );
+    int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage );
 
     if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values
     {
diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
index efca29f2429fd54b852384793d67538b4ab0356d..0ed8efa4d4b10b98e0579b15e0a44a29dad72622 100644
--- a/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
+++ b/src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h
@@ -15,8 +15,7 @@
 #include <TNL/Containers/Array.h>
 #include "cassert"
 #include <TNL/Algorithms/Sorting/detail/bitonicSort.h>
-#include <TNL/Algorithms/Sorting/detail/reduction.h>
-#include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
+#include <TNL/Algorithms/detail/CudaScanKernel.h>
 
 namespace TNL {
     namespace Algorithms {
@@ -134,8 +133,10 @@ __device__ void singleBlockQuickSort( Containers::ArrayView<Value, TNL::Devices:
         countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot);
 
         //synchronization is in this function already
-        int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
-        int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
+        using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >;
+        __shared__ typename BlockScan::Storage storage;
+        int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage );
+        int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage );
 
         if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src
         {
diff --git a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
index 8d26d0637bd55c12751639b36d288e890debf00e..555e6c538f01a87a000507bda45f1ff92b4dcc09 100644
--- a/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
+++ b/src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h
@@ -13,8 +13,6 @@
 #pragma once
 
 #include <TNL/Containers/Array.h>
-#include <TNL/Containers/Vector.h>
-#include <TNL/Algorithms/Sorting/detail/reduction.h>
 #include <TNL/Algorithms/Sorting/detail/task.h>
 #include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
 #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
@@ -33,7 +31,7 @@ __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2n
 //-----------------------------------------------------------
 
 __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
-                                     Containers::VectorView<int, Devices::Cuda> blocksNeeded)
+                                     Containers::ArrayView<int, Devices::Cuda> blocksNeeded)
 {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= cuda_tasks.getSize())
@@ -49,7 +47,7 @@ __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda>
 template <typename Value, typename CMP>
 __global__ void cudaInitTask(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks,
                              Containers::ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping,
-                             Containers::VectorView<int, Devices::Cuda> cuda_reductionTaskInitMem,
+                             Containers::ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                              Containers::ArrayView<Value, Devices::Cuda> src, CMP Cmp)
 {
     if (blockIdx.x >= cuda_tasks.getSize())
diff --git a/src/TNL/Algorithms/Sorting/detail/reduction.h b/src/TNL/Algorithms/Sorting/detail/reduction.h
deleted file mode 100644
index e2bf148099e830bae96264ab4cda5faa5b964181..0000000000000000000000000000000000000000
--- a/src/TNL/Algorithms/Sorting/detail/reduction.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/***************************************************************************
-                          reduction.h  -  description
-                             -------------------
-    begin                : Jul 13, 2021
-    copyright            : (C) 2021 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-// Implemented by: Xuan Thang Nguyen
-
-#pragma once
-
-namespace TNL {
-    namespace Algorithms {
-        namespace Sorting {
-
-#ifdef HAVE_CUDA
-
-/**
- * https://developer.nvidia.com/blog/faster-parallel-reductions-kepler/
- * */
-
-
-__device__ int warpReduceSum(int initVal)
-{
-    const unsigned int maskConstant = 0xffffffff; //not used
-    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
-        initVal += __shfl_xor_sync(maskConstant, initVal, mask);
-
-    return initVal;
-}
-
-__device__ int blockReduceSum(int val)
-{
-    static __shared__ int shared[32];
-    int lane = threadIdx.x & (warpSize - 1);
-    int wid = threadIdx.x / warpSize;
-
-    val = warpReduceSum(val);
-
-    if (lane == 0)
-        shared[wid] = val;
-    __syncthreads();
-
-    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
-
-    if (wid == 0)
-        val = warpReduceSum(val);
-
-    if(threadIdx.x == 0)
-        shared[0] = val;
-    __syncthreads();
-
-    return shared[0];
-}
-
-//-------------------------------------------------------------------------------
-
-__device__ int warpInclusivePrefixSum(int value)
-{
-    int laneId = threadIdx.x & (32-1);
-
-    #pragma unroll
-    for (int i = 1; i*2 <= 32; i *= 2)//32 here is warp size
-    {
-        int n = __shfl_up_sync(0xffffffff, value, i);
-        if ((laneId & (warpSize - 1)) >= i)
-            value += n;
-    }
-
-    return value;
-}
-
-__device__ int blockInclusivePrefixSum(int value)
-{
-    static __shared__ int shared[32];
-    int lane = threadIdx.x & (warpSize - 1);
-    int wid = threadIdx.x / warpSize;
-
-    int tmp = warpInclusivePrefixSum(value);
-
-    if (lane == warpSize-1)
-        shared[wid] = tmp;
-    __syncthreads();
-
-    int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
-    if (wid == 0)
-        shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2;
-    __syncthreads();
-
-    tmp += shared[wid];
-    return tmp;
-}
-
-//--------------------------------------------------------------------
-
-template<typename Operator>
-__device__ int warpCmpReduce(int initVal, const Operator & Cmp)
-{
-    const unsigned int maskConstant = 0xffffffff; //not used
-    for (unsigned int mask = warpSize / 2; mask > 0; mask >>= 1)
-        initVal = Cmp(initVal, __shfl_xor_sync(maskConstant, initVal, mask));
-
-    return initVal;
-}
-
-template<typename Operator>
-__device__ int blockCmpReduce(int val, const Operator & Cmp)
-{
-    static __shared__ int shared[32];
-    int lane = threadIdx.x & (warpSize - 1);
-    int wid = threadIdx.x / warpSize;
-
-    val = warpCmpReduce(val, Cmp);
-
-    if (lane == 0)
-        shared[wid] = val;
-    __syncthreads();
-
-    val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : shared[0];
-
-    if (wid == 0)
-        val = warpCmpReduce(val, Cmp);
-
-    if(threadIdx.x == 0)
-        shared[0] = val;
-    __syncthreads();
-
-    return shared[0];
-}
-
-#endif
-
-        } // namespace Sorting
-    } // namespace Algorithms
-} // namespace TNL
\ No newline at end of file
diff --git a/src/TNL/Algorithms/contains.h b/src/TNL/Algorithms/contains.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3a2524f22b633a49f206d6bfaffb4e8459a019b
--- /dev/null
+++ b/src/TNL/Algorithms/contains.h
@@ -0,0 +1,75 @@
+/***************************************************************************
+                          contains.h  -  description
+                             -------------------
+    begin                : Jul 27, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Algorithms/detail/Contains.h>
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Checks if an array/vector/view contains an element with given value.
+ *
+ * By default, all elements of the array are checked. If \e begin or \e end is
+ * set to a non-zero value, only elements in the sub-interval `[begin, end)` are
+ * checked.
+ *
+ * \param array The array to be searched.
+ * \param value The value to be checked.
+ * \param begin The beginning of the array sub-interval. It is 0 by default.
+ * \param end The end of the array sub-interval. The default value is 0 which
+ *            is, however, replaced with the array size.
+ * \return `true` if there is _at least one_ element in the sub-interval
+ *         `[begin, end)` which has the value \e value.
+ */
+template< typename Array >
+bool
+contains( const Array& array,
+          typename Array::ValueType value,
+          typename Array::IndexType begin = 0,
+          typename Array::IndexType end = 0 )
+{
+   TNL_ASSERT_TRUE( array.getData(), "Attempted to check a value of an empty array." );
+   if( end == 0 )
+      end = array.getSize();
+   return detail::Contains< typename Array::DeviceType >()( array.getData() + begin, end - begin, value );
+}
+
+/**
+ * \brief Checks if all elements of an array/vector/view have the given value.
+ *
+ * By default, all elements of the array are checked. If \e begin or \e end is
+ * set to a non-zero value, only elements in the sub-interval `[begin, end)` are
+ * checked.
+ *
+ * \param array The array to be searched.
+ * \param value The value to be checked.
+ * \param begin The beginning of the array sub-interval. It is 0 by default.
+ * \param end The end of the array sub-interval. The default value is 0 which
+ *            is, however, replaced with the array size.
+ * \return `true` if _all_ elements in the sub-interval `[begin, end)` have the
+ *         same value \e value.
+ */
+template< typename Array >
+bool
+containsOnlyValue( const Array& array,
+                   typename Array::ValueType value,
+                   typename Array::IndexType begin = 0,
+                   typename Array::IndexType end = 0 )
+{
+   TNL_ASSERT_TRUE( array.getData(), "Attempted to check a value of an empty array." );
+   if( end == 0 )
+      end = array.getSize();
+   return detail::ContainsOnlyValue< typename Array::DeviceType >()( array.getData() + begin, end - begin, value );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/detail/Contains.h b/src/TNL/Algorithms/detail/Contains.h
new file mode 100644
index 0000000000000000000000000000000000000000..77a191b30aa1bc8a55a655f550a6b6ae992b5ebc
--- /dev/null
+++ b/src/TNL/Algorithms/detail/Contains.h
@@ -0,0 +1,160 @@
+/***************************************************************************
+                          Contains.h  -  description
+                             -------------------
+    begin                : Jul 27, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Cuda/CudaCallable.h>
+#include <TNL/Algorithms/reduce.h>
+
+namespace TNL {
+namespace Algorithms {
+namespace detail {
+
+template< typename Device >
+struct Contains;
+
+template< typename Device >
+struct ContainsOnlyValue;
+
+
+template<>
+struct Contains< Devices::Sequential >
+{
+   template< typename Element,
+             typename Index >
+   __cuda_callable__
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      for( Index i = 0; i < size; i++ )
+         if( data[ i ] == value )
+            return true;
+      return false;
+   }
+};
+
+template<>
+struct ContainsOnlyValue< Devices::Sequential >
+{
+   template< typename Element,
+             typename Index >
+   __cuda_callable__
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      for( Index i = 0; i < size; i++ )
+         if( ! ( data[ i ] == value ) )
+            return false;
+      return true;
+   }
+};
+
+
+template<>
+struct Contains< Devices::Host >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
+         auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; };
+         return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+      }
+      else {
+         // sequential algorithm can return as soon as it finds a match
+         return Contains< Devices::Sequential >{}( data, size, value );
+      }
+   }
+};
+
+template<>
+struct ContainsOnlyValue< Devices::Host >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
+         auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; };
+         return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+      }
+      else {
+         // sequential algorithm can return as soon as it finds a mismatch
+         return ContainsOnlyValue< Devices::Sequential >{}( data, size, value );
+      }
+   }
+};
+
+
+template<>
+struct Contains< Devices::Cuda >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, (Index) 0, "" );
+
+      auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
+      return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+   }
+};
+
+template<>
+struct ContainsOnlyValue< Devices::Cuda >
+{
+   template< typename Element,
+             typename Index >
+   bool operator()( const Element* data,
+                    const Index size,
+                    const Element& value )
+   {
+      if( size == 0 ) return false;
+      TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
+      TNL_ASSERT_GE( size, 0, "" );
+
+      auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
+      return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+   }
+};
+
+} // namespace detail
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/detail/CudaMultireductionKernel.h b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
index 973d8e958c66b29b49d46419020ee938c2205143..c40d0602efb7e691af5d2007fbf42c377dcdfdcc 100644
--- a/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaMultireductionKernel.h
@@ -47,7 +47,7 @@ template< int blockSizeX,
           typename Index >
 __global__ void
 __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
-CudaMultireductionKernel( const Result zero,
+CudaMultireductionKernel( const Result identity,
                           DataFetcher dataFetcher,
                           const Reduction reduction,
                           const Index size,
@@ -65,7 +65,7 @@ CudaMultireductionKernel( const Result zero,
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if( y >= n ) return;
 
-   sdata[ tid ] = zero;
+   sdata[ tid ] = identity;
 
    // Start with the sequential reduction and push the result into the shared memory.
    while( gid + 4 * gridSizeX < size ) {
@@ -145,7 +145,7 @@ template< typename Result,
           typename Reduction,
           typename Index >
 int
-CudaMultireductionKernelLauncher( const Result zero,
+CudaMultireductionKernelLauncher( const Result identity,
                                   DataFetcher dataFetcher,
                                   const Reduction reduction,
                                   const Index size,
@@ -217,55 +217,55 @@ CudaMultireductionKernelLauncher( const Result zero,
    {
       case 512:
          CudaMultireductionKernel< 512 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case 256:
          cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel< 256 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case 128:
          cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel< 128 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case  64:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<  64 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case  32:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<  32 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case  16:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<  16 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
      case   8:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<   8 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case   4:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<   4 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
         break;
       case   2:
          cudaFuncSetCacheConfig(CudaMultireductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
 
          CudaMultireductionKernel<   2 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output );
+         <<< gridSize, blockSize, shmem >>>( identity, dataFetcher, reduction, size, n, output );
          break;
       case   1:
          throw std::logic_error( "blockSize should not be 1." );
diff --git a/src/TNL/Algorithms/detail/CudaReductionKernel.h b/src/TNL/Algorithms/detail/CudaReductionKernel.h
index 51f38f18bee4b8645d720538d190bee04b59e265..c08c686eb3d42704f4cd53d858fb7447e1ace845 100644
--- a/src/TNL/Algorithms/detail/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/detail/CudaReductionKernel.h
@@ -23,6 +23,336 @@ namespace TNL {
 namespace Algorithms {
 namespace detail {
 
+#ifdef HAVE_CUDA
+/* Template for cooperative reduction across the CUDA block of threads.
+ * It is a *cooperative* operation - all threads must call the operation,
+ * otherwise it will deadlock!
+ *
+ * The default implementation is generic and the reduction is done using
+ * shared memory. Specializations can be made based on `Reduction` and
+ * `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported
+ * value types.
+ */
+template< int blockSize,
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockReduce
+{
+   // storage to be allocated in shared memory
+   struct Storage
+   {
+      // when there is only one warp per block (blockSize <= 32), we need to allocate
+      // two warps' worth of shared memory so that we don't index it out of bounds
+      ValueType data[ (blockSize <= 32) ? 2 * blockSize : blockSize ];
+   };
+
+   /* Cooperative reduction across the CUDA block - each thread will get the
+    * result of the reduction
+    *
+    * \param reduction   The binary reduction functor.
+    * \param identity    Neutral element for the given reduction operation, i.e.
+    *                    a value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue Value of the calling thread to be reduced.
+    * \param tid         Index of the calling thread (usually `threadIdx.x`,
+    *                    unless you know what you are doing).
+    * \param storage     Auxiliary storage (must be allocated as a __shared__
+    *                    variable).
+    */
+   __device__ static
+   ValueType
+   reduce( const Reduction& reduction,
+           ValueType identity,
+           ValueType threadValue,
+           int tid,
+           Storage& storage )
+   {
+      storage.data[ tid ] = threadValue;
+      __syncthreads();
+
+      if( blockSize >= 1024 ) {
+         if( tid < 512 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 512 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 512 ) {
+         if( tid < 256 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 256 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 256 ) {
+         if( tid < 128 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 128 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 128 ) {
+         if( tid <  64 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 64 ] );
+         __syncthreads();
+      }
+
+      // This runs in one warp so we use __syncwarp() instead of __syncthreads().
+      if( tid < 32 ) {
+         if( blockSize >= 64 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 32 ] );
+         __syncwarp();
+         // Note that here we do not have to check if tid < 16 etc, because we have
+         // 2 * blockSize elements of shared memory per block when blockSize <= 32,
+         // so we do not access out of bounds. The results for the upper half will
+         // be undefined, but unused anyway.
+         if( blockSize >= 32 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 16 ] );
+         __syncwarp();
+         if( blockSize >= 16 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 8 ] );
+         __syncwarp();
+         if( blockSize >=  8 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 4 ] );
+         __syncwarp();
+         if( blockSize >=  4 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 2 ] );
+         __syncwarp();
+         if( blockSize >=  2 )
+            storage.data[ tid ] = reduction( storage.data[ tid ], storage.data[ tid + 1 ] );
+      }
+
+      __syncthreads();
+      return storage.data[ 0 ];
+   }
+};
+
+template< int blockSize,
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockReduceShfl
+{
+   // storage to be allocated in shared memory
+   struct Storage
+   {
+      ValueType warpResults[ Cuda::getWarpSize() ];
+   };
+
+   /* Cooperative reduction across the CUDA block - each thread will get the
+    * result of the reduction
+    *
+    * \param reduction   The binary reduction functor.
+    * \param identity    Neutral element for the given reduction operation, i.e.
+    *                    a value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue Value of the calling thread to be reduced.
+    * \param tid         Index of the calling thread (usually `threadIdx.x`,
+    *                    unless you know what you are doing).
+    * \param storage     Auxiliary storage (must be allocated as a __shared__
+    *                    variable).
+    */
+   __device__ static
+   ValueType
+   reduce( const Reduction& reduction,
+           ValueType identity,
+           ValueType threadValue,
+           int tid,
+           Storage& storage )
+   {
+      // verify the configuration
+      static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
+                     "blockSize is too large, it would not be possible to reduce warpResults using one warp" );
+
+      int lane_id = threadIdx.x % warpSize;
+      int warp_id = threadIdx.x / warpSize;
+
+      // perform the parallel reduction across warps
+      threadValue = warpReduce( reduction, threadValue );
+
+      // the first thread of each warp writes the result into the shared memory
+      if( lane_id == 0 )
+         storage.warpResults[ warp_id ] = threadValue;
+      __syncthreads();
+
+      // the first warp performs the final reduction
+      if( warp_id == 0 ) {
+         // read from shared memory only if that warp existed
+         if( tid < blockSize / Cuda::getWarpSize() )
+            threadValue = storage.warpResults[ lane_id ];
+         else
+            threadValue = identity;
+         threadValue = warpReduce( reduction, threadValue );
+      }
+
+      // the first thread writes the result into the shared memory
+      if( tid == 0 )
+         storage.warpResults[ 0 ] = threadValue;
+
+      __syncthreads();
+      return storage.warpResults[ 0 ];
+   }
+
+   /* Helper function.
+    * Cooperative reduction across the warp - each thread will get the result
+    * of the reduction
+    */
+   __device__ static
+   ValueType
+   warpReduce( const Reduction& reduction,
+               ValueType threadValue )
+   {
+      constexpr unsigned mask = 0xffffffff;
+      #pragma unroll
+      for( int i = Cuda::getWarpSize() / 2; i > 0; i /= 2 ) {
+         const ValueType otherValue = __shfl_xor_sync( mask, threadValue, i );
+         threadValue = reduction( threadValue, otherValue );
+      }
+      return threadValue;
+   }
+};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, int >
+: public CudaBlockReduceShfl< blockSize, Reduction, int >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, unsigned int >
+: public CudaBlockReduceShfl< blockSize, Reduction, unsigned int >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, long >
+: public CudaBlockReduceShfl< blockSize, Reduction, long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, unsigned long >
+: public CudaBlockReduceShfl< blockSize, Reduction, unsigned long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, long long >
+: public CudaBlockReduceShfl< blockSize, Reduction, long long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, unsigned long long >
+: public CudaBlockReduceShfl< blockSize, Reduction, unsigned long long >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, float >
+: public CudaBlockReduceShfl< blockSize, Reduction, float >
+{};
+
+template< int blockSize,
+          typename Reduction >
+struct CudaBlockReduce< blockSize, Reduction, double >
+: public CudaBlockReduceShfl< blockSize, Reduction, double >
+{};
+
+/* Template for cooperative reduction with argument across the CUDA block of
+ * threads. It is a *cooperative* operation - all threads must call the
+ * operation, otherwise it will deadlock!
+ *
+ * The default implementation is generic and the reduction is done using
+ * shared memory. Specializations can be made based on `Reduction` and
+ * `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported
+ * value types.
+ */
+template< int blockSize,
+          typename Reduction,
+          typename ValueType,
+          typename IndexType >
+struct CudaBlockReduceWithArgument
+{
+   // storage to be allocated in shared memory
+   struct Storage
+   {
+      // when there is only one warp per block (blockSize <= 32), we need to allocate
+      // two warps' worth of shared memory so that we don't index it out of bounds
+      ValueType data[ (blockSize <= 32) ? 2 * blockSize : blockSize ];
+      IndexType idx [ (blockSize <= 32) ? 2 * blockSize : blockSize ];
+   };
+
+   /* Cooperative reduction with argument across the CUDA block - each thread
+    * will get the pair of the result of the reduction and the index
+    *
+    * \param reduction   The binary reduction functor.
+    * \param identity    Neutral element for the given reduction operation, i.e.
+    *                    a value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue Value of the calling thread to be reduced.
+    * \param threadIndex Index value of the calling thread to be reduced.
+    * \param tid         Index of the calling thread (usually `threadIdx.x`,
+    *                    unless you know what you are doing).
+    * \param storage     Auxiliary storage (must be allocated as a __shared__
+    *                    variable).
+    */
+   __device__ static
+   std::pair< ValueType, IndexType >
+   reduceWithArgument( const Reduction& reduction,
+                       ValueType identity,
+                       ValueType threadValue,
+                       IndexType threadIndex,
+                       int tid,
+                       Storage& storage )
+   {
+      storage.data[ tid ] = threadValue;
+      storage.idx[ tid ] = threadIndex;
+      __syncthreads();
+
+      if( blockSize >= 1024 ) {
+         if( tid < 512 )
+            reduction( storage.data[ tid ], storage.data[ tid + 512 ], storage.idx[ tid ], storage.idx[ tid + 512 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 512 ) {
+         if( tid < 256 )
+            reduction( storage.data[ tid ], storage.data[ tid + 256 ], storage.idx[ tid ], storage.idx[ tid + 256 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 256 ) {
+         if( tid < 128 )
+            reduction( storage.data[ tid ], storage.data[ tid + 128 ], storage.idx[ tid ], storage.idx[ tid + 128 ] );
+         __syncthreads();
+      }
+      if( blockSize >= 128 ) {
+         if( tid <  64 )
+            reduction( storage.data[ tid ], storage.data[ tid + 64 ], storage.idx[ tid ], storage.idx[ tid + 64 ] );
+         __syncthreads();
+      }
+
+      // This runs in one warp so we use __syncwarp() instead of __syncthreads().
+      if( tid < 32 ) {
+         if( blockSize >= 64 )
+            reduction( storage.data[ tid ], storage.data[ tid + 32 ], storage.idx[ tid ], storage.idx[ tid + 32 ] );
+         __syncwarp();
+         // Note that here we do not have to check if tid < 16 etc, because we have
+         // 2 * blockSize elements of shared memory per block when blockSize <= 32,
+         // so we do not access out of bounds. The results for the upper half will
+         // be undefined, but unused anyway.
+         if( blockSize >= 32 )
+            reduction( storage.data[ tid ], storage.data[ tid + 16 ], storage.idx[ tid ], storage.idx[ tid + 16 ] );
+         __syncwarp();
+         if( blockSize >= 16 )
+            reduction( storage.data[ tid ], storage.data[ tid + 8 ], storage.idx[ tid ], storage.idx[ tid + 8 ] );
+         __syncwarp();
+         if( blockSize >=  8 )
+            reduction( storage.data[ tid ], storage.data[ tid + 4 ], storage.idx[ tid ], storage.idx[ tid + 4 ] );
+         __syncwarp();
+         if( blockSize >=  4 )
+            reduction( storage.data[ tid ], storage.data[ tid + 2 ], storage.idx[ tid ], storage.idx[ tid + 2 ] );
+         __syncwarp();
+         if( blockSize >=  2 )
+            reduction( storage.data[ tid ], storage.data[ tid + 1 ], storage.idx[ tid ], storage.idx[ tid + 1 ] );
+      }
+
+      __syncthreads();
+      return std::make_pair( storage.data[ 0 ], storage.idx[ 0 ] );
+   }
+};
+#endif
+
 /****
  * The performance of this kernel is very sensitive to register usage.
  * Compile with --ptxas-options=-v and configure these constants for given
@@ -40,260 +370,160 @@ static constexpr int Reduction_registersPerThread = 32;   // empirically determi
    static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
 #endif
 
-/*
- * nvcc (as of 10.2) is totally fucked up, in some cases it does not recognize the
- * std::plus<void>::operator() function to be constexpr and hence __host__ __device__
- * (for example, when the arguments are StaticVector<3, double> etc). Hence, we use
- * this wrapper which triggers only a warning and not an error as is the case when
- * the reduction functor is called from a __global__ or __device__ function. Let's
- * hope it works otherwise...
- */
-template< typename Reduction, typename Arg1, typename Arg2 >
-__host__ __device__
-auto CudaReductionFunctorWrapper( Reduction&& reduction, Arg1&& arg1, Arg2&& arg2 )
-{
-// let's suppress the aforementioned warning...
-#ifdef __NVCC__
-#pragma push
-#pragma diag_suppress 2979  // error number for nvcc 10.2
-#pragma diag_suppress 3123  // error number for nvcc 11.1
-#endif
-   return std::forward<Reduction>(reduction)( std::forward<Arg1>(arg1), std::forward<Arg2>(arg2) );
-#ifdef __NVCC__
-#pragma pop
-#endif
-}
-
 template< int blockSize,
-          typename Result,
           typename DataFetcher,
           typename Reduction,
+          typename Result,
           typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionKernel( const Result zero,
-                     DataFetcher dataFetcher,
+CudaReductionKernel( DataFetcher dataFetcher,
                      const Reduction reduction,
-                     const Index begin,
-                     const Index end,
+                     Result identity,
+                     Index begin,
+                     Index end,
                      Result* output )
 {
    TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
-   // when there is only one warp per blockSize.x, we need to allocate two warps
-   // worth of shared memory so that we don't index shared memory out of bounds
-   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
-   __shared__ Result sdata[shmemElements];
-
-   // Get the thread id (tid), global thread id (gid) and gridSize.
-   const Index tid = threadIdx.x;
-         Index gid = begin + blockIdx.x * blockDim.x + threadIdx.x;
-   const Index gridSize = blockDim.x * gridDim.x;
 
-   sdata[ tid ] = zero;
+   // allocate shared memory
+   using BlockReduce = CudaBlockReduce< blockSize, Reduction, Result >;
+   union Shared {
+      typename BlockReduce::Storage blockReduceStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
+
+   // Calculate the grid size (stride of the sequential reduction loop).
+   const Index gridSize = blockDim.x * gridDim.x;
+   // Shift the input lower bound by the thread index in the grid.
+   begin += blockIdx.x * blockDim.x + threadIdx.x;
 
    // Start with the sequential reduction and push the result into the shared memory.
-   while( gid + 4 * gridSize < end ) {
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
-      gid += 4 * gridSize;
+   Result result = identity;
+   while( begin + 4 * gridSize < end ) {
+      result = reduction( result, dataFetcher( begin ) );
+      result = reduction( result, dataFetcher( begin + gridSize ) );
+      result = reduction( result, dataFetcher( begin + 2 * gridSize ) );
+      result = reduction( result, dataFetcher( begin + 3 * gridSize ) );
+      begin += 4 * gridSize;
    }
-   while( gid + 2 * gridSize < end ) {
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid + gridSize ) );
-      gid += 2 * gridSize;
+   while( begin + 2 * gridSize < end ) {
+      result = reduction( result, dataFetcher( begin ) );
+      result = reduction( result, dataFetcher( begin + gridSize ) );
+      begin += 2 * gridSize;
    }
-   while( gid < end ) {
-      sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], dataFetcher( gid ) );
-      gid += gridSize;
+   while( begin < end ) {
+      result = reduction( result, dataFetcher( begin ) );
+      begin += gridSize;
    }
    __syncthreads();
 
    // Perform the parallel reduction.
-   if( blockSize >= 1024 ) {
-      if( tid < 512 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 512 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 512 ) {
-      if( tid < 256 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 256 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 256 ) {
-      if( tid < 128 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 128 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 128 ) {
-      if( tid <  64 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 64 ] );
-      __syncthreads();
-   }
-
-   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
-   if( tid < 32 ) {
-      if( blockSize >= 64 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 32 ] );
-      __syncwarp();
-      // Note that here we do not have to check if tid < 16 etc, because we have
-      // 2 * blockSize.x elements of shared memory per block, so we do not
-      // access out of bounds. The results for the upper half will be undefined,
-      // but unused anyway.
-      if( blockSize >= 32 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 16 ] );
-      __syncwarp();
-      if( blockSize >= 16 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 8 ] );
-      __syncwarp();
-      if( blockSize >=  8 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 4 ] );
-      __syncwarp();
-      if( blockSize >=  4 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 2 ] );
-      __syncwarp();
-      if( blockSize >=  2 )
-         sdata[ tid ] = CudaReductionFunctorWrapper( reduction, sdata[ tid ], sdata[ tid + 1 ] );
-   }
+   result = BlockReduce::reduce( reduction, identity, result, threadIdx.x, storage.blockReduceStorage );
 
    // Store the result back in the global memory.
-   if( tid == 0 )
-      output[ blockIdx.x ] = sdata[ 0 ];
+   if( threadIdx.x == 0 )
+      output[ blockIdx.x ] = result;
 }
 
 template< int blockSize,
-          typename Result,
           typename DataFetcher,
           typename Reduction,
+          typename Result,
           typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionWithArgumentKernel( const Result zero,
-                                 DataFetcher dataFetcher,
+CudaReductionWithArgumentKernel( DataFetcher dataFetcher,
                                  const Reduction reduction,
-                                 const Index begin,
-                                 const Index end,
+                                 Result identity,
+                                 Index begin,
+                                 Index end,
                                  Result* output,
                                  Index* idxOutput,
                                  const Index* idxInput = nullptr )
 {
    TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaReductionKernel" );
-   // when there is only one warp per blockSize.x, we need to allocate two warps
-   // worth of shared memory so that we don't index shared memory out of bounds
-   constexpr int shmemElements = (blockSize <= 32) ? 2 * blockSize : blockSize;
-   __shared__ Result sdata[shmemElements];
-   __shared__ Index sidx[shmemElements];
-
-   // Get the thread id (tid), global thread id (gid) and gridSize.
-   const Index tid = threadIdx.x;
-         Index gid = begin + blockIdx.x * blockDim.x + threadIdx.x;
+
+   // allocate shared memory
+   using BlockReduce = CudaBlockReduceWithArgument< blockSize, Reduction, Result, Index >;
+   union Shared {
+      typename BlockReduce::Storage blockReduceStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
+
+   // Calculate the grid size (stride of the sequential reduction loop).
    const Index gridSize = blockDim.x * gridDim.x;
+   // Shift the input lower bound by the thread index in the grid.
+   begin += blockIdx.x * blockDim.x + threadIdx.x;
+
+   // TODO: initialIndex should be passed as an argument to the kernel
+   Index initialIndex;
 
    // Start with the sequential reduction and push the result into the shared memory.
+   Result result = identity;
    if( idxInput ) {
-      if( gid < end ) {
-         sdata[ tid ] = dataFetcher( gid );
-         sidx[ tid ] = idxInput[ gid ];
-         gid += gridSize;
-      } else {
-         sdata[ tid ] = zero;
+      if( begin < end ) {
+         result = dataFetcher( begin );
+         initialIndex = idxInput[ begin ];
+         begin += gridSize;
       }
-      while( gid + 4 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
-         reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], idxInput[ gid + 2 * gridSize ] );
-         reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], idxInput[ gid + 3 * gridSize ] );
-         gid += 4 * gridSize;
+      while( begin + 4 * gridSize < end ) {
+         reduction( result, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
+         reduction( result, dataFetcher( begin + 2 * gridSize ), initialIndex, idxInput[ begin + 2 * gridSize ] );
+         reduction( result, dataFetcher( begin + 3 * gridSize ), initialIndex, idxInput[ begin + 3 * gridSize ] );
+         begin += 4 * gridSize;
       }
-      while( gid + 2 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], idxInput[ gid + gridSize ] );
-         gid += 2 * gridSize;
+      while( begin + 2 * gridSize < end ) {
+         reduction( result, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, idxInput[ begin + gridSize ] );
+         begin += 2 * gridSize;
       }
-      while( gid < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], idxInput[ gid ] );
-         gid += gridSize;
+      while( begin < end ) {
+         reduction( result, dataFetcher( begin ), initialIndex, idxInput[ begin ] );
+         begin += gridSize;
       }
    }
    else {
-      if( gid < end ) {
-         sdata[ tid ] = dataFetcher( gid );
-         sidx[ tid ] = gid;
-         gid += gridSize;
-      } else {
-         sdata[ tid ] = zero;
+      if( begin < end ) {
+         result = dataFetcher( begin );
+         initialIndex = begin;
+         begin += gridSize;
       }
-      while( gid + 4 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
-         reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ), sidx[ tid ], gid + 2 * gridSize );
-         reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ), sidx[ tid ], gid + 3 * gridSize );
-         gid += 4 * gridSize;
+      while( begin + 4 * gridSize < end ) {
+         reduction( result, dataFetcher( begin ), initialIndex, begin );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
+         reduction( result, dataFetcher( begin + 2 * gridSize ), initialIndex, begin + 2 * gridSize );
+         reduction( result, dataFetcher( begin + 3 * gridSize ), initialIndex, begin + 3 * gridSize );
+         begin += 4 * gridSize;
       }
-      while( gid + 2 * gridSize < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
-         reduction( sdata[ tid ], dataFetcher( gid + gridSize ), sidx[ tid ], gid + gridSize );
-         gid += 2 * gridSize;
+      while( begin + 2 * gridSize < end ) {
+         reduction( result, dataFetcher( begin ), initialIndex, begin );
+         reduction( result, dataFetcher( begin + gridSize ), initialIndex, begin + gridSize );
+         begin += 2 * gridSize;
       }
-      while( gid < end ) {
-         reduction( sdata[ tid ], dataFetcher( gid ), sidx[ tid ], gid );
-         gid += gridSize;
+      while( begin < end ) {
+         reduction( result, dataFetcher( begin ), initialIndex, begin );
+         begin += gridSize;
       }
    }
    __syncthreads();
 
    // Perform the parallel reduction.
-   if( blockSize >= 1024 ) {
-      if( tid < 512 )
-         reduction( sdata[ tid ], sdata[ tid + 512 ], sidx[ tid ], sidx[ tid + 512 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 512 ) {
-      if( tid < 256 )
-         reduction( sdata[ tid ], sdata[ tid + 256 ], sidx[ tid ], sidx[ tid + 256 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 256 ) {
-      if( tid < 128 )
-         reduction( sdata[ tid ], sdata[ tid + 128 ], sidx[ tid ], sidx[ tid + 128 ] );
-      __syncthreads();
-   }
-   if( blockSize >= 128 ) {
-      if( tid <  64 )
-         reduction( sdata[ tid ], sdata[ tid + 64 ], sidx[ tid ], sidx[ tid + 64 ] );
-      __syncthreads();
-   }
-
-   // This runs in one warp so we use __syncwarp() instead of __syncthreads().
-   if( tid < 32 ) {
-      if( blockSize >= 64 )
-         reduction( sdata[ tid ], sdata[ tid + 32 ], sidx[ tid ], sidx[ tid + 32 ] );
-      __syncwarp();
-      // Note that here we do not have to check if tid < 16 etc, because we have
-      // 2 * blockSize.x elements of shared memory per block, so we do not
-      // access out of bounds. The results for the upper half will be undefined,
-      // but unused anyway.
-      if( blockSize >= 32 )
-         reduction( sdata[ tid ], sdata[ tid + 16 ], sidx[ tid ], sidx[ tid + 16 ] );
-      __syncwarp();
-      if( blockSize >= 16 )
-         reduction( sdata[ tid ], sdata[ tid + 8 ], sidx[ tid ], sidx[ tid + 8 ] );
-      __syncwarp();
-      if( blockSize >=  8 )
-         reduction( sdata[ tid ], sdata[ tid + 4 ], sidx[ tid ], sidx[ tid + 4 ] );
-      __syncwarp();
-      if( blockSize >=  4 )
-         reduction( sdata[ tid ], sdata[ tid + 2 ], sidx[ tid ], sidx[ tid + 2 ] );
-      __syncwarp();
-      if( blockSize >=  2 )
-         reduction( sdata[ tid ], sdata[ tid + 1 ], sidx[ tid ], sidx[ tid + 1 ] );
-   }
+   const std::pair< Result, Index > result_pair = BlockReduce::reduceWithArgument( reduction, identity, result, initialIndex, threadIdx.x, storage.blockReduceStorage );
 
    // Store the result back in the global memory.
-   if( tid == 0 ) {
-      output[ blockIdx.x ] = sdata[ 0 ];
-      idxOutput[ blockIdx.x ] = sidx[ 0 ];
+   if( threadIdx.x == 0 ) {
+      output[ blockIdx.x ] = result_pair.first;
+      idxOutput[ blockIdx.x ] = result_pair.second;
    }
 }
 #endif
@@ -330,7 +560,7 @@ struct CudaReductionKernelLauncher
              typename Reduction >
    int start( const Reduction& reduction,
               DataFetcher& dataFetcher,
-              const Result& zero,
+              const Result& identity,
               Result*& output )
    {
       // create reference to the reduction buffer singleton and set size
@@ -339,7 +569,7 @@ struct CudaReductionKernelLauncher
       cudaReductionBuffer.setSize( buf_size );
       output = cudaReductionBuffer.template getData< Result >();
 
-      this->reducedSize = this->launch( begin, end, reduction, dataFetcher, zero, output );
+      this->reducedSize = this->launch( begin, end, reduction, dataFetcher, identity, output );
       return this->reducedSize;
    }
 
@@ -347,7 +577,7 @@ struct CudaReductionKernelLauncher
              typename Reduction >
    int startWithArgument( const Reduction& reduction,
                           DataFetcher& dataFetcher,
-                          const Result& zero,
+                          const Result& identity,
                           Result*& output,
                           Index*& idxOutput )
    {
@@ -358,14 +588,14 @@ struct CudaReductionKernelLauncher
       output = cudaReductionBuffer.template getData< Result >();
       idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] );
 
-      this->reducedSize = this->launchWithArgument( begin, end, reduction, dataFetcher, zero, output, idxOutput, nullptr );
+      this->reducedSize = this->launchWithArgument( begin, end, reduction, dataFetcher, identity, output, idxOutput, nullptr );
       return this->reducedSize;
    }
 
    template< typename Reduction >
    Result
    finish( const Reduction& reduction,
-           const Result& zero )
+           const Result& identity )
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
@@ -376,7 +606,7 @@ struct CudaReductionKernelLauncher
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
          auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
-         this->reducedSize = this->launch( 0, this->reducedSize, reduction, copyFetch, zero, output );
+         this->reducedSize = this->launch( 0, this->reducedSize, reduction, copyFetch, identity, output );
          std::swap( input, output );
       }
 
@@ -393,7 +623,7 @@ struct CudaReductionKernelLauncher
    template< typename Reduction >
    std::pair< Result, Index >
    finishWithArgument( const Reduction& reduction,
-                       const Result& zero )
+                       const Result& identity )
    {
       // Input is the first half of the buffer, output is the second half
       CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
@@ -406,7 +636,7 @@ struct CudaReductionKernelLauncher
       {
          // this lambda has to be defined inside the loop, because the captured variable changes
          auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; };
-         this->reducedSize = this->launchWithArgument( ( Index ) 0, this->reducedSize, reduction, copyFetch, zero, output, idxOutput, idxInput );
+         this->reducedSize = this->launchWithArgument( ( Index ) 0, this->reducedSize, reduction, copyFetch, identity, output, idxOutput, idxInput );
          std::swap( input, output );
          std::swap( idxInput, idxOutput );
       }
@@ -432,7 +662,7 @@ struct CudaReductionKernelLauncher
                   const Index end,
                   const Reduction& reduction,
                   DataFetcher& dataFetcher,
-                  const Result& zero,
+                  const Result& identity,
                   Result* output )
       {
 #ifdef HAVE_CUDA
@@ -449,55 +679,55 @@ struct CudaReductionKernelLauncher
          {
             case 512:
                CudaReductionKernel< 512 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case 256:
-               cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel< 256, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel< 256 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case 128:
-               cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel< 128, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel< 128 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case  64:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<  64, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  64 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case  32:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<  32, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  32 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case  16:
-               cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<  16, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<  16 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
            case   8:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<   8, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   8 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case   4:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<   4, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   4 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case   2:
-               cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionKernel<   2, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionKernel<   2 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output);
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output);
                break;
             case   1:
                TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -510,11 +740,11 @@ struct CudaReductionKernelLauncher
 
          // Check just to future-proof the code setting blockSize.x
          if( blockSize.x == Reduction_maxThreadsPerBlock ) {
-            cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+            cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
             // shared memory is allocated statically inside the kernel
             CudaReductionKernel< Reduction_maxThreadsPerBlock >
-            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output);
+            <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, begin, end, output);
             cudaStreamSynchronize(0);
             TNL_CHECK_CUDA_DEVICE;
          }
@@ -535,7 +765,7 @@ struct CudaReductionKernelLauncher
                               const Index end,
                               const Reduction& reduction,
                               DataFetcher& dataFetcher,
-                              const Result& zero,
+                              const Result& identity,
                               Result* output,
                               Index* idxOutput,
                               const Index* idxInput )
@@ -554,55 +784,55 @@ struct CudaReductionKernelLauncher
          {
             case 512:
                CudaReductionWithArgumentKernel< 512 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case 256:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel< 256 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case 128:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel< 128 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case  64:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  64, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  64 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case  32:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  32, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  32 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case  16:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<  16, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<  16 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
            case   8:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   8, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   8 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case   4:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   4, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   4 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case   2:
-               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+               cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel<   2, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
                CudaReductionWithArgumentKernel<   2 >
-               <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput );
+               <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, size, output, idxOutput, idxInput );
                break;
             case   1:
                TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
@@ -615,11 +845,11 @@ struct CudaReductionKernelLauncher
 
          // Check just to future-proof the code setting blockSize.x
          if( blockSize.x == Reduction_maxThreadsPerBlock ) {
-            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+            cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, DataFetcher, Reduction, Result, Index >, cudaFuncCachePreferShared);
 
             // shared memory is allocated statically inside the kernel
             CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock >
-            <<< gridSize, blockSize >>>( zero, dataFetcher, reduction, begin, end, output, idxOutput, idxInput );
+            <<< gridSize, blockSize >>>( dataFetcher, reduction, identity, begin, end, output, idxOutput, idxInput );
             cudaStreamSynchronize(0);
             TNL_CHECK_CUDA_DEVICE;
          }
diff --git a/src/TNL/Algorithms/detail/CudaScanKernel.h b/src/TNL/Algorithms/detail/CudaScanKernel.h
index 63072ea893da12f6790f364d7d456f2c84b06945..be1c5a380aeb9e5e3b268bdf600182c382146f2d 100644
--- a/src/TNL/Algorithms/detail/CudaScanKernel.h
+++ b/src/TNL/Algorithms/detail/CudaScanKernel.h
@@ -10,363 +10,941 @@
 
 #pragma once
 
-#include <iostream>
-
 #include <TNL/Math.h>
 #include <TNL/Cuda/SharedMemory.h>
 #include <TNL/Exceptions/CudaBadAlloc.h>
 #include <TNL/Containers/Array.h>
+#include "ScanType.h"
 
 namespace TNL {
 namespace Algorithms {
 namespace detail {
 
 #ifdef HAVE_CUDA
-
-template< typename Real,
+/* Template for cooperative scan across the CUDA block of threads.
+ * It is a *cooperative* operation - all threads must call the operation,
+ * otherwise it will deadlock!
+ *
+ * The default implementation is generic and the reduction is done using
+ * shared memory. Specializations can be made based on `Reduction` and
+ * `ValueType`, e.g. using the `__shfl_sync` intrinsics for supported
+ * value types.
+ */
+template< ScanType scanType,
+          int blockSize,
           typename Reduction,
-          typename Index >
-__global__ void
-cudaFirstPhaseBlockScan( const ScanType scanType,
-                         Reduction reduction,
-                         const Real zero,
-                         const Index size,
-                         const int elementsInBlock,
-                         const Real* input,
-                         Real* output,
-                         Real* auxArray )
+          typename ValueType >
+struct CudaBlockScan
 {
-   Real* sharedData = TNL::Cuda::getSharedMemory< Real >();
-   Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ];
-   Real* warpSums = &auxData[ blockDim.x ];
-
-   const Index lastElementIdx = size - blockIdx.x * elementsInBlock;
-   const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock );
+   // storage to be allocated in shared memory
+   struct Storage
+   {
+      ValueType chunkResults[ blockSize + blockSize / Cuda::getNumberOfSharedMemoryBanks() ];  // accessed via Cuda::getInterleaving()
+      ValueType warpResults[ Cuda::getWarpSize() ];
+   };
 
-   /***
-    * Load data into the shared memory.
+   /* Cooperative scan across the CUDA block - each thread will get the
+    * result of the scan according to its ID.
+    *
+    * \param reduction    The binary reduction functor.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue  Value of the calling thread to be reduced.
+    * \param tid          Index of the calling thread (usually `threadIdx.x`,
+    *                     unless you know what you are doing).
+    * \param storage      Auxiliary storage (must be allocated as a __shared__
+    *                     variable).
     */
-   const int blockOffset = blockIdx.x * elementsInBlock;
-   int idx = threadIdx.x;
-   if( scanType == ScanType::Exclusive )
+   __device__ static
+   ValueType
+   scan( const Reduction& reduction,
+         ValueType identity,
+         ValueType threadValue,
+         int tid,
+         Storage& storage )
    {
-      if( idx == 0 )
-         sharedData[ 0 ] = zero;
-      while( idx < elementsInBlock && blockOffset + idx < size )
-      {
-         sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
-         idx += blockDim.x;
+      // verify the configuration
+      TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaBlockScan::scan" );
+      static_assert( blockSize / Cuda::getWarpSize() <= Cuda::getWarpSize(),
+                     "blockSize is too large, it would not be possible to scan warpResults using one warp" );
+
+      // store the threadValue in the shared memory
+      const int chunkResultIdx = Cuda::getInterleaving( tid );
+      storage.chunkResults[ chunkResultIdx ] = threadValue;
+      __syncthreads();
+
+      // perform the parallel scan on chunkResults inside warps
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
+      #pragma unroll
+      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
+         if( lane_id >= stride ) {
+            storage.chunkResults[ chunkResultIdx ] = reduction( storage.chunkResults[ chunkResultIdx ], storage.chunkResults[ Cuda::getInterleaving( tid - stride ) ] );
+         }
+         __syncwarp();
+      }
+      threadValue = storage.chunkResults[ chunkResultIdx ];
+
+      // the last thread in warp stores the intermediate result in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = threadValue;
+      __syncthreads();
+
+      // perform the scan of warpResults using one warp
+      if( warp_id == 0 )
+         #pragma unroll
+         for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
+            if( lane_id >= stride )
+               storage.warpResults[ tid ] = reduction( storage.warpResults[ tid ], storage.warpResults[ tid - stride ] );
+            __syncwarp();
+         }
+      __syncthreads();
+
+      // shift threadValue by the warpResults
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );
+
+      // shift the result for exclusive scan
+      if( scanType == ScanType::Exclusive ) {
+         storage.chunkResults[ chunkResultIdx ] = threadValue;
+         __syncthreads();
+         threadValue = (tid == 0) ? identity : storage.chunkResults[ Cuda::getInterleaving( tid - 1 ) ];
       }
+
+      __syncthreads();
+      return threadValue;
    }
-   else
+};
+
+template< ScanType scanType,
+          int __unused,  // the __shfl implementation does not depend on the blockSize
+          typename Reduction,
+          typename ValueType >
+struct CudaBlockScanShfl
+{
+   // storage to be allocated in shared memory
+   struct Storage
    {
-      while( idx < elementsInBlock && blockOffset + idx < size )
-      {
-         sharedData[ Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
-         idx += blockDim.x;
+      ValueType warpResults[ Cuda::getWarpSize() ];
+   };
+
+   /* Cooperative scan across the CUDA block - each thread will get the
+    * result of the scan according to its ID.
+    *
+    * \param reduction    The binary reduction functor.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
+    * \param threadValue  Value of the calling thread to be reduced.
+    * \param tid          Index of the calling thread (usually `threadIdx.x`,
+    *                     unless you know what you are doing).
+    * \param storage      Auxiliary storage (must be allocated as a __shared__
+    *                     variable).
+    */
+   __device__ static
+   ValueType
+   scan( const Reduction& reduction,
+         ValueType identity,
+         ValueType threadValue,
+         int tid,
+         Storage& storage )
+   {
+      const int lane_id = tid % Cuda::getWarpSize();
+      const int warp_id = tid / Cuda::getWarpSize();
+
+      // perform the parallel scan across warps
+      ValueType total;
+      threadValue = warpScan< scanType >( reduction, identity, threadValue, lane_id, total );
+
+      // the last thread in warp stores the result of inclusive scan in warpResults
+      if( lane_id == Cuda::getWarpSize() - 1 )
+         storage.warpResults[ warp_id ] = total;
+      __syncthreads();
+
+      // the first warp performs the scan of warpResults
+      if( warp_id == 0 ) {
+         // read from shared memory only if that warp existed
+         if( tid < blockDim.x / Cuda::getWarpSize() )
+            total = storage.warpResults[ lane_id ];
+         else
+            total = identity;
+         storage.warpResults[ lane_id ] = warpScan< ScanType::Inclusive >( reduction, identity, total, lane_id, total );
       }
+      __syncthreads();
+
+      // shift threadValue by the warpResults
+      if( warp_id > 0 )
+         threadValue = reduction( threadValue, storage.warpResults[ warp_id - 1 ] );
+
+      __syncthreads();
+      return threadValue;
    }
 
-   /***
-    * Perform the sequential prefix-sum.
+   /* Helper function.
+    * Cooperative scan across the warp - each thread will get the result of the
+    * scan according to its ID.
+    * return value = thread's result of the *warpScanType* scan
+    * total = thread's result of the *inclusive* scan
     */
-   __syncthreads();
-   const int chunkSize = elementsInBlock / blockDim.x;
-   const int chunkOffset = threadIdx.x * chunkSize;
-   const int numberOfChunks = roundUpDivision( lastElementInBlock, chunkSize );
-
-   if( chunkOffset < lastElementInBlock )
+   template< ScanType warpScanType >
+   __device__ static
+   ValueType
+   warpScan( const Reduction& reduction,
+             ValueType identity,
+             ValueType threadValue,
+             int lane_id,
+             ValueType& total )
    {
-      auxData[ threadIdx.x ] =
-         sharedData[ Cuda::getInterleaving( chunkOffset ) ];
+      constexpr unsigned mask = 0xffffffff;
+
+      // perform an inclusive scan
+      #pragma unroll
+      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
+         const ValueType otherValue = __shfl_up_sync( mask, threadValue, stride );
+         if( lane_id >= stride )
+            threadValue = reduction( threadValue, otherValue );
+      }
+
+      // set the result of the inclusive scan
+      total = threadValue;
+
+      // shift the result for exclusive scan
+      if( warpScanType == ScanType::Exclusive ) {
+         threadValue = __shfl_up_sync( mask, threadValue, 1 );
+         if( lane_id == 0 )
+            threadValue = identity;
+      }
+
+      return threadValue;
    }
+};
+
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, int >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, int >
+{};
+
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned int >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned int >
+{};
+
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, long >
+{};
+
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long >
+{};
+
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, long long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, long long >
+{};
+
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, unsigned long long >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, unsigned long long >
+{};
+
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, float >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, float >
+{};
 
-   int chunkPointer = 1;
-   while( chunkPointer < chunkSize &&
-          chunkOffset + chunkPointer < lastElementInBlock )
+template< ScanType scanType,
+          int blockSize,
+          typename Reduction >
+struct CudaBlockScan< scanType, blockSize, Reduction, double >
+: public CudaBlockScanShfl< scanType, blockSize, Reduction, double >
+{};
+
+/* Template for cooperative scan of a data tile in the global memory.
+ * It is a *cooperative* operation - all threads must call the operation,
+ * otherwise it will deadlock!
+ */
+template< ScanType scanType,
+          int blockSize,
+          int valuesPerThread,
+          typename Reduction,
+          typename ValueType >
+struct CudaTileScan
+{
+   using BlockScan = CudaBlockScan< ScanType::Exclusive, blockSize, Reduction, ValueType >;
+
+   // storage to be allocated in shared memory
+   struct Storage
    {
-      sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
-         reduction( sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
-                    sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
-      auxData[ threadIdx.x ] =
-         sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
-      chunkPointer++;
-   }
+      ValueType data[ blockSize * valuesPerThread ];
+      typename BlockScan::Storage blockScanStorage;
+   };
 
-   /***
-    *  Perform the parallel prefix-sum inside warps.
+   /* Cooperative scan of a data tile in the global memory - each thread will
+    * get the result of its chunk (i.e. the last value of the (inclusive) scan
+    * in the chunk) according to the thread ID.
+    *
+    * \param input        The input array to be scanned.
+    * \param output       The array where the result will be stored.
+    * \param begin        The first element in the array to be scanned.
+    * \param end          The last element in the array to be scanned.
+    * \param outputBegin  The first element in the output array to be written. There
+    *                     must be at least `end - begin` elements in the output
+    *                     array starting at the position given by `outputBegin`.
+    * \param reduction    The binary reduction functor.
+    * \param identity     Neutral element for given reduction operation, i.e.
+    *                     value such that `reduction(identity, x) == x` for any `x`.
+    * \param shift        A global shift to be applied to all elements in the
+    *                     chunk processed by this thread.
+    * \param storage      Auxiliary storage (must be allocated as a __shared__
+    *                     variable).
     */
-   const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
-   const int warpIdx = threadIdx.x / Cuda::getWarpSize();
-   for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-      if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
-         auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] );
-      __syncwarp();
-   }
+   template< typename InputView,
+             typename OutputView >
+   __device__ static
+   ValueType
+   scan( const InputView input,
+         OutputView output,
+         typename InputView::IndexType begin,
+         typename InputView::IndexType end,
+         typename OutputView::IndexType outputBegin,
+         const Reduction& reduction,
+         ValueType identity,
+         ValueType shift,
+         Storage& storage )
+   {
+      // verify the configuration
+      TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaTileScan::scan" );
+      static_assert( valuesPerThread % 2,
+                     "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts "
+                     "when threads access their chunks in shared memory sequentially" );
+
+      // calculate indices
+      constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+      const int remainingElements = end - begin - blockIdx.x * maxElementsInBlock;
+      const int elementsInBlock = TNL::min( remainingElements, maxElementsInBlock );
+
+      // update global array offsets for the thread
+      const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
+      begin += threadOffset;
+      outputBegin += threadOffset;
+
+      // Load data into the shared memory.
+      {
+         int idx = threadIdx.x;
+         while( idx < elementsInBlock )
+         {
+            storage.data[ idx ] = input[ begin ];
+            begin += blockDim.x;
+            idx += blockDim.x;
+         }
+         // fill the remaining (maxElementsInBlock - elementsInBlock) values with identity
+         // (this helps to avoid divergent branches in the blocks below)
+         while( idx < maxElementsInBlock )
+         {
+            storage.data[ idx ] = identity;
+            idx += blockDim.x;
+         }
+      }
+      __syncthreads();
 
-   if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
-      warpSums[ warpIdx ] = auxData[ threadIdx.x ];
-   __syncthreads();
+      // Perform sequential reduction of the thread's chunk in shared memory.
+      const int chunkOffset = threadIdx.x * valuesPerThread;
+      ValueType value = storage.data[ chunkOffset ];
+      #pragma unroll
+      for( int i = 1; i < valuesPerThread; i++ )
+         value = reduction( value, storage.data[ chunkOffset + i ] );
 
-   /****
-    * Compute prefix-sum of warp sums using one warp
-    */
-   if( warpIdx == 0 )
-      for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
-         if( threadInWarpIdx >= stride )
-            warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] );
-         __syncwarp();
+      // Scan the spine to obtain the initial value ("offset") for the downsweep.
+      value = BlockScan::scan( reduction, identity, value, threadIdx.x, storage.blockScanStorage );
+
+      // Apply the global shift.
+      value = reduction( value, shift );
+
+      // Downsweep step: scan the chunks and use the result of spine scan as the initial value.
+      #pragma unroll
+      for( int i = 0; i < valuesPerThread; i++ )
+      {
+         const ValueType inputValue = storage.data[ chunkOffset + i ];
+         if( scanType == ScanType::Exclusive )
+            storage.data[ chunkOffset + i ] = value;
+         value = reduction( value, inputValue );
+         if( scanType == ScanType::Inclusive )
+            storage.data[ chunkOffset + i ] = value;
       }
-   __syncthreads();
+      __syncthreads();
 
-   /****
-    * Shift the warp prefix-sums.
-    */
-   if( warpIdx > 0 )
-      auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] );
-   __syncthreads();
+      // Store the result back in the global memory.
+      {
+         int idx = threadIdx.x;
+         while( idx < elementsInBlock )
+         {
+            output[ outputBegin ] = storage.data[ idx ];
+            outputBegin += blockDim.x;
+            idx += blockDim.x;
+         }
+      }
 
-   /***
-    *  Store the result back in global memory.
-    */
-   idx = threadIdx.x;
-   while( idx < elementsInBlock && blockOffset + idx < size )
-   {
-      const int chunkIdx = idx / chunkSize;
-      Real chunkShift( zero );
-      if( chunkIdx > 0 )
-         chunkShift = auxData[ chunkIdx - 1 ];
-      sharedData[ Cuda::getInterleaving( idx ) ] =
-         reduction( sharedData[ Cuda::getInterleaving( idx ) ], chunkShift );
-      output[ blockOffset + idx ] = sharedData[ Cuda::getInterleaving( idx ) ];
-      idx += blockDim.x;
+      // Return the last (inclusive) scan value of the chunk processed by this thread.
+      return value;
    }
-   __syncthreads();
+};
 
-   if( threadIdx.x == 0 )
+/* CudaScanKernelUpsweep - compute partial reductions for each CUDA block.
+ */
+template< int blockSize,
+          int valuesPerThread,
+          typename InputView,
+          typename Reduction,
+          typename ValueType >
+__global__ void
+CudaScanKernelUpsweep( const InputView input,
+                       typename InputView::IndexType begin,
+                       typename InputView::IndexType end,
+                       Reduction reduction,
+                       ValueType identity,
+                       ValueType* reductionResults )
+{
+   // verify the configuration
+   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelUpsweep" );
+   static_assert( valuesPerThread % 2,
+                  "valuesPerThread must be odd, otherwise there would be shared memory bank conflicts "
+                  "when threads access their chunks in shared memory sequentially" );
+
+   // allocate shared memory
+   using BlockReduce = CudaBlockReduce< blockSize, Reduction, ValueType >;
+   union Shared {
+      ValueType data[ blockSize * valuesPerThread ];
+      typename BlockReduce::Storage blockReduceStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
+
+   // calculate indices
+   constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+   const int remainingElements = end - begin - blockIdx.x * maxElementsInBlock;
+   const int elementsInBlock = TNL::min( remainingElements, maxElementsInBlock );
+
+   // update global array offset for the thread
+   const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
+   begin += threadOffset;
+
+   // Load data into the shared memory.
    {
-      if( scanType == ScanType::Exclusive )
+      int idx = threadIdx.x;
+      while( idx < elementsInBlock )
       {
-         auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
-                                             sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
+         storage.data[ idx ] = input[ begin ];
+         begin += blockDim.x;
+         idx += blockDim.x;
+      }
+      // fill the remaining (maxElementsInBlock - elementsInBlock) values with identity
+      // (this helps to avoid divergent branches in the blocks below)
+      while( idx < maxElementsInBlock )
+      {
+         storage.data[ idx ] = identity;
+         idx += blockDim.x;
       }
-      else
-         auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
    }
+   __syncthreads();
+
+   // Perform sequential reduction of the thread's chunk in shared memory.
+   const int chunkOffset = threadIdx.x * valuesPerThread;
+   ValueType value = storage.data[ chunkOffset ];
+   #pragma unroll
+   for( int i = 1; i < valuesPerThread; i++ )
+      value = reduction( value, storage.data[ chunkOffset + i ] );
+   __syncthreads();
+
+   // Perform the parallel reduction.
+   value = BlockReduce::reduce( reduction, identity, value, threadIdx.x, storage.blockReduceStorage );
+
+   // Store the block result in the global memory.
+   if( threadIdx.x == 0 )
+      reductionResults[ blockIdx.x ] = value;
 }
 
-template< typename Real,
-          typename Reduction,
-          typename Index >
+/* CudaScanKernelDownsweep - scan each tile of the input separately in each CUDA
+ * block and use the result of spine scan as the initial value
+ */
+template< ScanType scanType,
+          int blockSize,
+          int valuesPerThread,
+          typename InputView,
+          typename OutputView,
+          typename Reduction >
+__global__ void
+CudaScanKernelDownsweep( const InputView input,
+                         OutputView output,
+                         typename InputView::IndexType begin,
+                         typename InputView::IndexType end,
+                         typename OutputView::IndexType outputBegin,
+                         Reduction reduction,
+                         typename OutputView::ValueType identity,
+                         typename OutputView::ValueType shift,
+                         const typename OutputView::ValueType* reductionResults )
+{
+   using ValueType = typename OutputView::ValueType;
+   using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;
+
+   // allocate shared memory
+   union Shared {
+      typename TileScan::Storage tileScanStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
+
+   // load the reduction of the previous tiles
+   shift = reduction( shift, reductionResults[ blockIdx.x ] );
+
+   // scan from input into output
+   TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, shift, storage.tileScanStorage );
+}
+
+/* CudaScanKernelParallel - scan each tile of the input separately in each CUDA
+ * block (first phase to be followed by CudaScanKernelUniformShift when there
+ * are multiple CUDA blocks).
+ */
+template< ScanType scanType,
+          int blockSize,
+          int valuesPerThread,
+          typename InputView,
+          typename OutputView,
+          typename Reduction >
 __global__ void
-cudaSecondPhaseBlockScan( Reduction reduction,
-                          const Index size,
-                          const int elementsInBlock,
-                          const Index gridIdx,
-                          const Index maxGridSize,
-                          const Real* auxArray,
-                          Real* data,
-                          Real shift )
+CudaScanKernelParallel( const InputView input,
+                        OutputView output,
+                        typename InputView::IndexType begin,
+                        typename InputView::IndexType end,
+                        typename OutputView::IndexType outputBegin,
+                        Reduction reduction,
+                        typename OutputView::ValueType identity,
+                        typename OutputView::ValueType* blockResults )
 {
-   if( gridIdx > 0 || blockIdx.x > 0 )
-      shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] );
-   const int readOffset = blockIdx.x * elementsInBlock;
-   int readIdx = threadIdx.x;
-   while( readIdx < elementsInBlock && readOffset + readIdx < size )
+   using ValueType = typename OutputView::ValueType;
+   using TileScan = CudaTileScan< scanType, blockSize, valuesPerThread, Reduction, ValueType >;
+
+   // allocate shared memory
+   union Shared {
+      typename TileScan::Storage tileScanStorage;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
+
+   // scan from input into output
+   const ValueType value = TileScan::scan( input, output, begin, end, outputBegin, reduction, identity, identity, storage.tileScanStorage );
+
+   // The last thread of the block stores the block result in the global memory.
+   if( blockResults && threadIdx.x == blockDim.x - 1 )
+      blockResults[ blockIdx.x ] = value;
+}
+
+/* CudaScanKernelUniformShift - apply a uniform shift to a pre-scanned output
+ * array.
+ *
+ * \param blockResults  An array of per-block shifts coming from the first phase
+ *                      (computed by CudaScanKernelParallel)
+ * \param shift         A global shift to be applied to all elements of the
+ *                      output array.
+ */
+template< int blockSize,
+          int valuesPerThread,
+          typename OutputView,
+          typename Reduction >
+__global__ void
+CudaScanKernelUniformShift( OutputView output,
+                            typename OutputView::IndexType outputBegin,
+                            typename OutputView::IndexType outputEnd,
+                            Reduction reduction,
+                            const typename OutputView::ValueType* blockResults,
+                            typename OutputView::ValueType shift )
+{
+   // load the block result into a __shared__ variable first
+   union Shared {
+      typename OutputView::ValueType blockResult;
+
+      // initialization is not allowed for __shared__ variables, so we need to
+      // disable initialization in the implicit default constructor
+      Shared() {}
+   };
+   __shared__ Shared storage;
+   if( threadIdx.x == 0 )
+      storage.blockResult = blockResults[ blockIdx.x ];
+
+   // update the output offset for the thread
+   TNL_ASSERT_EQ( blockDim.x, blockSize, "unexpected block size in CudaScanKernelUniformShift" );
+   constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+   const int threadOffset = blockIdx.x * maxElementsInBlock + threadIdx.x;
+   outputBegin += threadOffset;
+
+   // update the block shift
+   __syncthreads();
+   shift = reduction( shift, storage.blockResult );
+
+   int valueIdx = 0;
+   while( valueIdx < valuesPerThread && outputBegin < outputEnd )
    {
-      data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift );
-      readIdx += blockDim.x;
+      output[ outputBegin ] = reduction( output[ outputBegin ], shift );
+      outputBegin += blockDim.x;
+      valueIdx++;
    }
 }
 
+/**
+ * \tparam blockSize  The CUDA block size to be used for kernel launch.
+ * \tparam valuesPerThread  Number of elements processed by each thread sequentially.
+ */
 template< ScanType scanType,
-          typename Real,
-          typename Index >
+          ScanPhaseType phaseType,
+          typename ValueType,
+          // use blockSize=256 for 32-bit value types, scale with sizeof(ValueType)
+          // to keep shared memory requirements constant
+          int blockSize = 256 * 4 / sizeof(ValueType),
+          // valuesPerThread should be odd to avoid shared memory bank conflicts
+          int valuesPerThread = 7 >
 struct CudaScanKernelLauncher
 {
    /****
     * \brief Performs both phases of prefix sum.
     *
-    * \param size  Number of elements to be scanned.
-    * \param deviceInput  Pointer to input data on GPU.
-    * \param deviceOutput  Pointer to output array on GPU, can be the same as input.
-    * \param reduction  Symmetric binary function representing the reduction operation
-    *                   (usually addition, i.e. an instance of \ref std::plus).
-    * \param zero  Neutral element for given reduction operation, i.e. value such that
-    *              `reduction(zero, x) == x` for any `x`.
-    * \param blockSize  The CUDA block size to be used for kernel launch.
+    * \param input the input array to be scanned
+    * \param output the array where the result will be stored
+    * \param begin the first element in the array to be scanned
+    * \param end the element following the last element to be scanned, i.e. the scanned range is [begin, end)
+    * \param outputBegin the first element in the output array to be written. There
+    *                    must be at least `end - begin` elements in the output
+    *                    array starting at the position given by `outputBegin`.
+    * \param reduction Symmetric binary function representing the reduction operation
+    *                  (usually addition, i.e. an instance of \ref std::plus).
+    * \param identity Neutral element for given reduction operation, i.e.
+    *                 value such that `reduction(identity, x) == x` for any `x`.
     */
-   template< typename Reduction >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
    static void
-   perform( const Index size,
-            const Real* deviceInput,
-            Real* deviceOutput,
-            Reduction& reduction,
-            const Real zero,
-            const int blockSize = 256 )
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
+            Reduction&& reduction,
+            typename OutputArray::ValueType identity )
    {
       const auto blockShifts = performFirstPhase(
-         size,
-         deviceInput,
-         deviceOutput,
+         input,
+         output,
+         begin,
+         end,
+         outputBegin,
          reduction,
-         zero,
-         blockSize );
+         identity );
+
+      // if the first-phase kernel was launched with just one block, skip the second phase
+      if( blockShifts.getSize() <= 2 )
+         return;
+
       performSecondPhase(
-         size,
-         deviceOutput,
-         blockShifts.getData(),
+         input,
+         output,
+         blockShifts,
+         begin,
+         end,
+         outputBegin,
          reduction,
-         zero,
-         blockSize );
+         identity,
+         identity );
    }
 
    /****
     * \brief Performs the first phase of prefix sum.
     *
-    * \param size  Number of elements to be scanned.
-    * \param deviceInput  Pointer to input data on GPU.
-    * \param deviceOutput  Pointer to output array on GPU, can be the same as input.
-    * \param reduction  Symmetric binary function representing the reduction operation
-    *                   (usually addition, i.e. an instance of \ref std::plus).
-    * \param zero  Neutral value for given reduction operation, i.e. value such that
-    *              `reduction(zero, x) == x` for any `x`.
-    * \param blockSize  The CUDA block size to be used for kernel launch.
+    * \param input the input array to be scanned
+    * \param output the array where the result will be stored
+    * \param begin the first element in the array to be scanned
+    * \param end the element following the last element to be scanned, i.e. the scanned range is [begin, end)
+    * \param outputBegin the first element in the output array to be written. There
+    *                    must be at least `end - begin` elements in the output
+    *                    array starting at the position given by `outputBegin`.
+    * \param reduction Symmetric binary function representing the reduction operation
+    *                  (usually addition, i.e. an instance of \ref std::plus).
+    * \param identity Neutral element for given reduction operation, i.e.
+    *                 value such that `reduction(identity, x) == x` for any `x`.
     */
-   template< typename Reduction >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
    static auto
-   performFirstPhase( const Index size,
-                      const Real* deviceInput,
-                      Real* deviceOutput,
-                      Reduction& reduction,
-                      const Real zero,
-                      const int blockSize = 256 )
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
+                      Reduction&& reduction,
+                      typename OutputArray::ValueType identity )
    {
-      // compute the number of grids
-      const int elementsInBlock = 8 * blockSize;
-      const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
-      const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
-      //std::cerr << "numberOfgrids =  " << numberOfGrids << std::endl;
-
-      // allocate array for the block sums
-      Containers::Array< Real, Devices::Cuda > blockSums;
-      blockSums.setSize( numberOfBlocks );
-
-      // loop over all grids
-      for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
-         // compute current grid size and size of data to be scanned
-         const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock;
-         Index currentSize = size - gridOffset;
-         if( currentSize / elementsInBlock > maxGridSize() )
-            currentSize = maxGridSize() * elementsInBlock;
-         //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;
-
-         // setup block and grid size
-         dim3 cudaBlockSize, cudaGridSize;
-         cudaBlockSize.x = blockSize;
-         cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock );
-
-         // run the kernel
-         const std::size_t sharedDataSize = elementsInBlock +
-                                            elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2;
-         const std::size_t sharedMemory = ( sharedDataSize + blockSize + Cuda::getWarpSize() ) * sizeof( Real );
-         cudaFirstPhaseBlockScan<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
-            ( scanType,
-              reduction,
-              zero,
-              currentSize,
-              elementsInBlock,
-              &deviceInput[ gridOffset ],
-              &deviceOutput[ gridOffset ],
-              &blockSums.getData()[ gridIdx * maxGridSize() ] );
+      static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
+      using Index = typename InputArray::IndexType;
+
+      if( end - begin <= blockSize * valuesPerThread ) {
+         // allocate array for the block results
+         Containers::Array< typename OutputArray::ValueType, Devices::Cuda > blockResults;
+         blockResults.setSize( 2 );
+         blockResults.setElement( 0, identity );
+
+         // run the kernel with just 1 block
+         if( end - begin <= blockSize )
+            CudaScanKernelParallel< scanType, blockSize, 1 ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
+                 &blockResults.getData()[ 1 ] );
+         else if( end - begin <= blockSize * 3 )
+            CudaScanKernelParallel< scanType, blockSize, 3 ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
+                 &blockResults.getData()[ 1 ] );
+         else if( end - begin <= blockSize * 5 )
+            CudaScanKernelParallel< scanType, blockSize, 5 ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
+                 &blockResults.getData()[ 1 ] );
+         else
+            CudaScanKernelParallel< scanType, blockSize, valuesPerThread ><<< 1, blockSize >>>
+               ( input.getConstView(),
+                 output.getView(),
+                 begin,
+                 end,
+                 outputBegin,
+                 reduction,
+                 identity,
+                 // blockResults are shifted by 1, because the 0-th element should stay identity
+                 &blockResults.getData()[ 1 ] );
+
+         // synchronize the null-stream
+         cudaStreamSynchronize(0);
+         TNL_CHECK_CUDA_DEVICE;
+
+         // Store the number of CUDA grids for the purpose of unit testing, i.e.
+         // to check if we test the algorithm with more than one CUDA grid.
+         gridsCount() = 1;
+
+         // blockResults now contains shift values for each block - to be used in the second phase
+         return blockResults;
       }
-
-      // synchronize the null-stream after all grids
-      cudaStreamSynchronize(0);
-      TNL_CHECK_CUDA_DEVICE;
-
-      // blockSums now contains sums of numbers in each block. The first phase
-      // ends by computing prefix-sum of this array.
-      if( numberOfBlocks > 1 ) {
-         CudaScanKernelLauncher< ScanType::Inclusive, Real, Index >::perform(
-            blockSums.getSize(),
-            blockSums.getData(),
-            blockSums.getData(),
+      else {
+         // compute the number of grids
+         constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+         const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
+         const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+
+         // allocate array for the block results
+         Containers::Array< typename OutputArray::ValueType, Devices::Cuda > blockResults;
+         blockResults.setSize( numberOfBlocks + 1 );
+
+         // loop over all grids
+         for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
+            // compute current grid offset and size of data to be scanned
+            const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
+            const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
+
+            // setup block and grid size
+            dim3 cudaBlockSize, cudaGridSize;
+            cudaBlockSize.x = blockSize;
+            cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
+
+            // run the kernel
+            switch( phaseType )
+            {
+               case ScanPhaseType::WriteInFirstPhase:
+                  CudaScanKernelParallel< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( input.getConstView(),
+                       output.getView(),
+                       begin + gridOffset,
+                       begin + gridOffset + currentSize,
+                       outputBegin + gridOffset,
+                       reduction,
+                       identity,
+                       &blockResults.getData()[ gridIdx * maxGridSize() ] );
+                  break;
+
+               case ScanPhaseType::WriteInSecondPhase:
+                  CudaScanKernelUpsweep< blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( input.getConstView(),
+                       begin + gridOffset,
+                       begin + gridOffset + currentSize,
+                       reduction,
+                       identity,
+                       &blockResults.getData()[ gridIdx * maxGridSize() ] );
+                  break;
+            }
+         }
+
+         // synchronize the null-stream after all grids
+         cudaStreamSynchronize(0);
+         TNL_CHECK_CUDA_DEVICE;
+
+         // blockResults now contains scan results for each block. The first phase
+         // ends by computing an exclusive scan of this array.
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::perform(
+            blockResults,
+            blockResults,
+            0,
+            blockResults.getSize(),
+            0,
             reduction,
-            zero,
-            blockSize );
-      }
+            identity );
 
-      // Store the number of CUDA grids for the purpose of unit testing, i.e.
-      // to check if we test the algorithm with more than one CUDA grid.
-      gridsCount() = numberOfGrids;
+         // Store the number of CUDA grids for the purpose of unit testing, i.e.
+         // to check if we test the algorithm with more than one CUDA grid.
+         gridsCount() = numberOfGrids;
 
-      // blockSums now contains shift values for each block - to be used in the second phase
-      return blockSums;
+         // blockResults now contains shift values for each block - to be used in the second phase
+         return blockResults;
+      }
    }
 
    /****
     * \brief Performs the second phase of prefix sum.
     *
-    * \param size  Number of elements to be scanned.
-    * \param deviceOutput  Pointer to output array on GPU.
+    * \param input the input array to be scanned
+    * \param output the array where the result will be stored
     * \param blockShifts  Pointer to a GPU array containing the block shifts. It is the
     *                     result of the first phase.
-    * \param reduction  Symmetric binary function representing the reduction operation
-    *                   (usually addition, i.e. an instance of \ref std::plus).
-    * \param shift  A constant shifting all elements of the array (usually `zero`, i.e.
-    *               the neutral value).
-    * \param blockSize  The CUDA block size to be used for kernel launch.
+    * \param begin the first element in the array to be scanned
+    * \param end the element following the last element to be scanned, i.e. the scanned range is [begin, end)
+    * \param outputBegin the first element in the output array to be written. There
+    *                    must be at least `end - begin` elements in the output
+    *                    array starting at the position given by `outputBegin`.
+    * \param reduction Symmetric binary function representing the reduction operation
+    *                  (usually addition, i.e. an instance of \ref std::plus).
+    * \param identity Neutral element for given reduction operation, i.e.
+    *                 value such that `reduction(identity, x) == x` for any `x`.
+    * \param shift A constant shifting all elements of the array (usually
+    *              `identity`, i.e. the neutral value).
     */
-   template< typename Reduction >
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
    static void
-   performSecondPhase( const Index size,
-                       Real* deviceOutput,
-                       const Real* blockShifts,
-                       Reduction& reduction,
-                       const Real shift,
-                       const Index blockSize = 256 )
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
+                       const BlockShifts& blockShifts,
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
+                       Reduction&& reduction,
+                       typename OutputArray::ValueType identity,
+                       typename OutputArray::ValueType shift )
    {
-      // compute the number of grids
-      const int elementsInBlock = 8 * blockSize;
-      const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
-      const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
-
-      // loop over all grids
-      for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
-         // compute current grid size and size of data to be scanned
-         const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock;
-         Index currentSize = size - gridOffset;
-         if( currentSize / elementsInBlock > maxGridSize() )
-            currentSize = maxGridSize() * elementsInBlock;
-         //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl;
-
-         // setup block and grid size
-         dim3 cudaBlockSize, cudaGridSize;
-         cudaBlockSize.x = blockSize;
-         cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock );
-
-         // run the kernel
-         cudaSecondPhaseBlockScan<<< cudaGridSize, cudaBlockSize >>>
-            ( reduction,
-              size,
-              elementsInBlock,
-              gridIdx,
-              (Index) maxGridSize(),
-              blockShifts,
-              &deviceOutput[ gridOffset ],
+      static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
+      using Index = typename InputArray::IndexType;
+
+      // if the input was already scanned with just one block in the first phase,
+      // it must be shifted uniformly in the second phase
+      if( end - begin <= blockSize * valuesPerThread ) {
+         CudaScanKernelUniformShift< blockSize, valuesPerThread ><<< 1, blockSize >>>
+            ( output.getView(),
+              outputBegin,
+              outputBegin + end - begin,
+              reduction,
+              blockShifts.getData(),
               shift );
       }
+      else {
+         // compute the number of grids
+         constexpr int maxElementsInBlock = blockSize * valuesPerThread;
+         const Index numberOfBlocks = roundUpDivision( end - begin, maxElementsInBlock );
+         const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+
+         // loop over all grids
+         for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
+            // compute current grid offset and size of data to be scanned
+            const Index gridOffset = gridIdx * maxGridSize() * maxElementsInBlock;
+            const Index currentSize = TNL::min( end - begin - gridOffset, maxGridSize() * maxElementsInBlock );
+
+            // setup block and grid size
+            dim3 cudaBlockSize, cudaGridSize;
+            cudaBlockSize.x = blockSize;
+            cudaGridSize.x = roundUpDivision( currentSize, maxElementsInBlock );
+
+            // run the kernel
+            switch( phaseType )
+            {
+               case ScanPhaseType::WriteInFirstPhase:
+                  CudaScanKernelUniformShift< blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( output.getView(),
+                       outputBegin + gridOffset,
+                       outputBegin + gridOffset + currentSize,
+                       reduction,
+                       &blockShifts.getData()[ gridIdx * maxGridSize() ],
+                       shift );
+                  break;
+
+               case ScanPhaseType::WriteInSecondPhase:
+                  CudaScanKernelDownsweep< scanType, blockSize, valuesPerThread ><<< cudaGridSize, cudaBlockSize >>>
+                     ( input.getConstView(),
+                       output.getView(),
+                       begin + gridOffset,
+                       begin + gridOffset + currentSize,
+                       outputBegin + gridOffset,
+                       reduction,
+                       identity,
+                       shift,
+                       &blockShifts.getData()[ gridIdx * maxGridSize() ] );
+                  break;
+            }
+         }
+      }
 
       // synchronize the null-stream after all grids
       cudaStreamSynchronize(0);
       TNL_CHECK_CUDA_DEVICE;
    }
 
-   /****
-    * The following serves for setting smaller maxGridSize so that we can force
-    * the prefix sum in CUDA to run with more the one grids in unit tests.
-    */
+   // The following serves for setting smaller maxGridSize so that we can force
+   // the scan in CUDA to run with more than one grid in unit tests.
    static int& maxGridSize()
    {
       static int maxGridSize = Cuda::getMaxGridSize();
@@ -376,6 +954,7 @@ struct CudaScanKernelLauncher
    static void resetMaxGridSize()
    {
       maxGridSize() = Cuda::getMaxGridSize();
+      gridsCount() = -1;
    }
 
    static int& gridsCount()
diff --git a/src/TNL/Algorithms/detail/DistributedScan.h b/src/TNL/Algorithms/detail/DistributedScan.h
new file mode 100644
index 0000000000000000000000000000000000000000..933056d9267560523e81dfdced6f15b72f740d66
--- /dev/null
+++ b/src/TNL/Algorithms/detail/DistributedScan.h
@@ -0,0 +1,74 @@
+/***************************************************************************
+                          DistributedScan.h  -  description
+                             -------------------
+    begin                : Aug 16, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include "Scan.h"
+
+#include <TNL/Containers/Array.h>
+#include <TNL/MPI/Wrappers.h>
+
+namespace TNL {
+namespace Algorithms {
+namespace detail {
+
+template< ScanType Type, ScanPhaseType PhaseType >
+struct DistributedScan
+{
+   template< typename InputDistributedArray,
+             typename OutputDistributedArray,
+             typename Reduction >
+   static void
+   perform( const InputDistributedArray& input,
+            OutputDistributedArray& output,
+            typename InputDistributedArray::IndexType begin,
+            typename InputDistributedArray::IndexType end,
+            Reduction&& reduction,
+            typename OutputDistributedArray::ValueType identity )
+   {
+      using ValueType = typename OutputDistributedArray::ValueType;
+      using DeviceType = typename OutputDistributedArray::DeviceType;
+
+      const auto group = input.getCommunicationGroup();
+      if( group != MPI::NullGroup() ) {
+         // adjust begin and end for the local range
+         const auto localRange = input.getLocalRange();
+         begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin();
+         end = max( min( end, localRange.getEnd() ), localRange.getBegin() ) - localRange.getBegin();
+
+         // perform first phase on the local data
+         const auto inputLocalView = input.getConstLocalView();
+         auto outputLocalView = output.getLocalView();
+         const auto block_results = Scan< DeviceType, Type, PhaseType >::performFirstPhase( inputLocalView, outputLocalView, begin, end, begin, reduction, identity );
+         const ValueType local_result = block_results.getElement( block_results.getSize() - 1 );
+
+         // exchange local results between ranks
+         const int nproc = MPI::GetSize( group );
+         ValueType dataForScatter[ nproc ];
+         for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = local_result;
+         Containers::Array< ValueType, Devices::Host > rank_results( nproc );
+         // NOTE: exchanging general data types does not work with MPI
+         MPI::Alltoall( dataForScatter, 1, rank_results.getData(), 1, group );
+
+         // compute the scan of the per-rank results
+         Scan< Devices::Host, ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform( rank_results, rank_results, 0, nproc, 0, reduction, identity );
+
+         // perform the second phase, using the per-block and per-rank results
+         const int rank = MPI::GetRank( group );
+         Scan< DeviceType, Type, PhaseType >::performSecondPhase( inputLocalView, outputLocalView, block_results, begin, end, begin, reduction, identity, rank_results[ rank ] );
+      }
+   }
+};
+
+} // namespace detail
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/detail/Reduction.h b/src/TNL/Algorithms/detail/Reduction.h
index 5db002cdd48fdea73df91bf690590ba73de84ae9..e06ad4bee92640343289f95368bda9da68a2c258 100644
--- a/src/TNL/Algorithms/detail/Reduction.h
+++ b/src/TNL/Algorithms/detail/Reduction.h
@@ -22,43 +22,12 @@ namespace TNL {
    namespace Algorithms {
       namespace detail {
 
-/**
- * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
- *
- * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
- * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
- * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
- * position of the smallest or the largest element, reduction with argument can be used.
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- *
- * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >.
- */
 template< typename Device >
 struct Reduction;
 
 template<>
 struct Reduction< Devices::Sequential >
 {
-   using DeviceType = Devices::Sequential;
-
-   /**
-    * \brief Computes reduction on CPU sequentially.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -68,29 +37,8 @@ struct Reduction< Devices::Sequential >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& identity );
 
-   /**
-    * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -100,31 +48,12 @@ struct Reduction< Devices::Sequential >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& identity );
 };
 
 template<>
 struct Reduction< Devices::Host >
 {
-   using DeviceType = Devices::Host;
-
-   /**
-    * \brief Computes reduction on CPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -134,29 +63,8 @@ struct Reduction< Devices::Host >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& identity );
 
-   /**
-    * \brief Computes reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam ReductionOperation is a lambda function performing the reduction.
-    * \tparam DataFetcher is a lambda function for fetching the input data.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -166,30 +74,12 @@ struct Reduction< Devices::Host >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& identity );
 };
 
 template<>
 struct Reduction< Devices::Cuda >
 {
-   using DeviceType = Devices::Cuda;
-
-   /**
-    * \brief Computes reduction on GPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -199,30 +89,8 @@ struct Reduction< Devices::Cuda >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& identity );
 
-   /**
-    * \brief Computes reduction on GPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    */
    template< typename Index,
              typename Result,
              typename Fetch,
@@ -232,7 +100,7 @@ struct Reduction< Devices::Cuda >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& identity );
 };
 
       } // namespace detail
diff --git a/src/TNL/Algorithms/detail/Reduction.hpp b/src/TNL/Algorithms/detail/Reduction.hpp
index 0d1c8231f02507a963652b11194c684a0d299088..abd6c63f57cef9b3c73c70e1286d93c60e277d0d 100644
--- a/src/TNL/Algorithms/detail/Reduction.hpp
+++ b/src/TNL/Algorithms/detail/Reduction.hpp
@@ -16,7 +16,7 @@
 
 //#define CUDA_REDUCTION_PROFILING
 
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/detail/Reduction.h>
 #include <TNL/Algorithms/detail/CudaReductionKernel.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 
@@ -46,7 +46,7 @@ reduce( const Index begin,
         const Index end,
         Fetch&& fetch,
         Reduce&& reduce,
-        const Result& zero )
+        const Result& identity )
 {
    constexpr int block_size = 128;
    const Index size = end - begin;
@@ -54,7 +54,7 @@ reduce( const Index begin,
 
    if( blocks > 1 ) {
       // initialize array for unrolled results
-      Result r[ 4 ] = { zero, zero, zero, zero };
+      Result r[ 4 ] = { identity, identity, identity, identity };
 
       // main reduce (explicitly unrolled loop)
       for( Index b = 0; b < blocks; b++ ) {
@@ -78,7 +78,7 @@ reduce( const Index begin,
       return r[ 0 ];
    }
    else {
-      Result result = zero;
+      Result result = identity;
       for( Index i = begin; i < end; i++ )
          result = reduce( result, fetch( i ) );
       return result;
@@ -95,7 +95,7 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduce&& reduce,
-                    const Result& zero )
+                    const Result& identity )
 {
    constexpr int block_size = 128;
    const Index size = end - begin;
@@ -104,7 +104,7 @@ reduceWithArgument( const Index begin,
    if( blocks > 1 ) {
       // initialize array for unrolled results
       Index arg[ 4 ] = { 0, 0, 0, 0 };
-      Result r[ 4 ] = { zero, zero, zero, zero };
+      Result r[ 4 ] = { identity, identity, identity, identity };
       bool initialized( false );
 
       // main reduce (explicitly unrolled loop)
@@ -143,7 +143,7 @@ reduceWithArgument( const Index begin,
    }
    else if( begin >= end ) {
       // trivial case, fetch should not be called in this case
-      return std::make_pair( zero, end );
+      return std::make_pair( identity, end );
    }
    else {
       std::pair< Result, Index > result( fetch( begin ), begin );
@@ -163,7 +163,7 @@ reduce( const Index begin,
         const Index end,
         Fetch&& fetch,
         Reduce&& reduce,
-        const Result& zero )
+        const Result& identity )
 {
 #ifdef HAVE_OPENMP
    constexpr int block_size = 128;
@@ -172,12 +172,12 @@ reduce( const Index begin,
 
    if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
       // global result variable
-      Result result = zero;
+      Result result = identity;
       const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
 #pragma omp parallel num_threads(threads)
       {
          // initialize array for thread-local results
-         Result r[ 4 ] = { zero, zero, zero, zero  };
+         Result r[ 4 ] = { identity, identity, identity, identity  };
 
          #pragma omp for nowait
          for( Index b = 0; b < blocks; b++ ) {
@@ -212,7 +212,7 @@ reduce( const Index begin,
    }
    else
 #endif
-      return Reduction< Devices::Sequential >::reduce( begin, end, fetch, reduce, zero );
+      return Reduction< Devices::Sequential >::reduce( begin, end, fetch, reduce, identity );
 }
 
 template< typename Index,
@@ -225,7 +225,7 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduce&& reduce,
-                    const Result& zero )
+                    const Result& identity )
 {
 #ifdef HAVE_OPENMP
    constexpr int block_size = 128;
@@ -234,13 +234,13 @@ reduceWithArgument( const Index begin,
 
    if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
       // global result variable
-      std::pair< Result, Index > result( zero, -1 );
+      std::pair< Result, Index > result( identity, -1 );
       const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
 #pragma omp parallel num_threads(threads)
       {
          // initialize array for thread-local results
          Index arg[ 4 ] = { 0, 0, 0, 0 };
-         Result r[ 4 ] = { zero, zero, zero, zero  };
+         Result r[ 4 ] = { identity, identity, identity, identity  };
          bool initialized( false );
 
          #pragma omp for nowait
@@ -290,7 +290,7 @@ reduceWithArgument( const Index begin,
    }
    else
 #endif
-      return Reduction< Devices::Sequential >::reduceWithArgument( begin, end, fetch, reduce, zero );
+      return Reduction< Devices::Sequential >::reduceWithArgument( begin, end, fetch, reduce, identity );
 }
 
 template< typename Index,
@@ -303,11 +303,11 @@ reduce( const Index begin,
         const Index end,
         Fetch&& fetch,
         Reduce&& reduce,
-        const Result& zero )
+        const Result& identity )
 {
    // trivial case, nothing to reduce
    if( begin >= end )
-      return zero;
+      return identity;
 
    // Only fundamental and pointer types can be safely reduced on host. Complex
    // objects stored on the device might contain pointers into the device memory,
@@ -327,7 +327,7 @@ reduce( const Index begin,
    const int reducedSize = reductionLauncher.start(
       reduce,
       fetch,
-      zero,
+      identity,
       deviceAux1 );
 
    #ifdef CUDA_REDUCTION_PROFILING
@@ -364,7 +364,7 @@ reduce( const Index begin,
 
       // finish the reduce on the host
       auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
-      const Result result = Reduction< Devices::Sequential >::reduce( 0, reducedSize, fetch, reduce, zero );
+      const Result result = Reduction< Devices::Sequential >::reduce( 0, reducedSize, fetch, reduce, identity );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -374,7 +374,7 @@ reduce( const Index begin,
    }
    else {
       // data can't be safely reduced on host, so continue with the reduce on the GPU
-      auto result = reductionLauncher.finish( reduce, zero );
+      auto result = reductionLauncher.finish( reduce, identity );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -397,11 +397,11 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduce&& reduce,
-                    const Result& zero )
+                    const Result& identity )
 {
    // trivial case, nothing to reduce
    if( begin >= end )
-      return std::make_pair( zero, end );
+      return std::make_pair( identity, end );
 
    // Only fundamental and pointer types can be safely reduced on host. Complex
    // objects stored on the device might contain pointers into the device memory,
@@ -422,7 +422,7 @@ reduceWithArgument( const Index begin,
    const int reducedSize = reductionLauncher.startWithArgument(
       reduce,
       fetch,
-      zero,
+      identity,
       deviceAux1,
       deviceIndexes );
 
@@ -475,7 +475,7 @@ reduceWithArgument( const Index begin,
 
       // finish the reduce on the host
 //      auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
-//      const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduce, fetch, zero );
+//      const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduce, fetch, identity );
       for( Index i = 1; i < reducedSize; i++ )
          reduce( resultArray[ 0 ], resultArray[ i ], indexArray[ 0 ], indexArray[ i ]  );
 
@@ -487,7 +487,7 @@ reduceWithArgument( const Index begin,
    }
    else {
       // data can't be safely reduced on host, so continue with the reduce on the GPU
-      auto result = reductionLauncher.finishWithArgument( reduce, zero );
+      auto result = reductionLauncher.finishWithArgument( reduce, identity );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
diff --git a/src/TNL/Algorithms/detail/Scan.h b/src/TNL/Algorithms/detail/Scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a32452d94142b37908d30748f9c64d834d761ed
--- /dev/null
+++ b/src/TNL/Algorithms/detail/Scan.h
@@ -0,0 +1,161 @@
+/***************************************************************************
+                          Scan.h  -  description
+                             -------------------
+    begin                : May 9, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Algorithms/detail/ScanType.h>
+
+namespace TNL {
+namespace Algorithms {
+namespace detail {
+
+template< typename Device, ScanType Type, ScanPhaseType PhaseType = ScanPhaseType::WriteInSecondPhase >
+struct Scan;
+
+template< ScanType Type, ScanPhaseType PhaseType >
+struct Scan< Devices::Sequential, Type, PhaseType >
+{
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+   // returns the last value of inclusive scan (reduction of the whole input)
+   static typename OutputArray::ValueType
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
+            Reduction&& reduction,
+            typename OutputArray::ValueType identity );
+
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+   static auto
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
+                      Reduction&& reduction,
+                      typename OutputArray::ValueType identity );
+
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
+   static void
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
+                       const BlockShifts& blockShifts,
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
+                       Reduction&& reduction,
+                       typename OutputArray::ValueType identity,
+                       typename OutputArray::ValueType shift );
+};
+
+template< ScanType Type, ScanPhaseType PhaseType >
+struct Scan< Devices::Host, Type, PhaseType >
+{
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+   static void
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
+            Reduction&& reduction,
+            typename OutputArray::ValueType identity );
+
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+   static auto
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
+                      Reduction&& reduction,
+                      typename OutputArray::ValueType identity );
+
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
+   static void
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
+                       const BlockShifts& blockShifts,
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
+                       Reduction&& reduction,
+                       typename OutputArray::ValueType identity,
+                       typename OutputArray::ValueType shift );
+};
+
+template< ScanType Type, ScanPhaseType PhaseType >
+struct Scan< Devices::Cuda, Type, PhaseType >
+{
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+   static void
+   perform( const InputArray& input,
+            OutputArray& output,
+            typename InputArray::IndexType begin,
+            typename InputArray::IndexType end,
+            typename OutputArray::IndexType outputBegin,
+            Reduction&& reduction,
+            typename OutputArray::ValueType identity );
+
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+   static auto
+   performFirstPhase( const InputArray& input,
+                      OutputArray& output,
+                      typename InputArray::IndexType begin,
+                      typename InputArray::IndexType end,
+                      typename OutputArray::IndexType outputBegin,
+                      Reduction&& reduction,
+                      typename OutputArray::ValueType identity );
+
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
+   static void
+   performSecondPhase( const InputArray& input,
+                       OutputArray& output,
+                       const BlockShifts& blockShifts,
+                       typename InputArray::IndexType begin,
+                       typename InputArray::IndexType end,
+                       typename OutputArray::IndexType outputBegin,
+                       Reduction&& reduction,
+                       typename OutputArray::ValueType identity,
+                       typename OutputArray::ValueType shift );
+};
+
+} // namespace detail
+} // namespace Algorithms
+} // namespace TNL
+
+#include "Scan.hpp"
diff --git a/src/TNL/Algorithms/detail/Scan.hpp b/src/TNL/Algorithms/detail/Scan.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..383d2b5e2ed3dafaa69d886c6053966e95349103
--- /dev/null
+++ b/src/TNL/Algorithms/detail/Scan.hpp
@@ -0,0 +1,471 @@
+/***************************************************************************
+                          Scan.hpp  -  description
+                             -------------------
+    begin                : Mar 24, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>  // std::forward
+
+#include "Scan.h"
+#include "CudaScanKernel.h"
+
+#include <TNL/Assert.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Containers/StaticArray.h>
+#include <TNL/Algorithms/reduce.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
+
+namespace TNL {
+namespace Algorithms {
+namespace detail {
+
+// Sequentially scans input[ begin, end ) and writes the result to output
+// starting at index outputBegin. Returns the reduction of the whole scanned
+// range (equal to the last element of the inclusive scan), which callers use
+// as the shift value for subsequent blocks in multi-phase scans.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+typename OutputArray::ValueType
+Scan< Devices::Sequential, Type, PhaseType >::
+perform( const InputArray& input,
+         OutputArray& output,
+         typename InputArray::IndexType begin,
+         typename InputArray::IndexType end,
+         typename OutputArray::IndexType outputBegin,
+         Reduction&& reduction,
+         typename OutputArray::ValueType identity )
+{
+   using ValueType = typename OutputArray::ValueType;
+
+   // simple sequential algorithm - not split into phases
+   ValueType aux = identity;
+   if( Type == ScanType::Inclusive ) {
+      for( ; begin < end; begin++, outputBegin++ )
+         output[ outputBegin ] = aux = reduction( aux, input[ begin ] );
+   }
+   else // Exclusive scan
+   {
+      for( ; begin < end; begin++, outputBegin++ ) {
+         // copy the element before writing the output, so the scan also works
+         // in-place (when output aliases input and outputBegin == begin)
+         const ValueType x = input[ begin ];
+         output[ outputBegin ] = aux;
+         aux = reduction( aux, x );
+      }
+   }
+   // return the last value of inclusive scan (reduction of the whole input)
+   return aux;
+}
+
+// First phase of a two-phase scan for the sequential device, where the phase
+// split is artificial (there is only a single "block"):
+// - WriteInFirstPhase: the block is fully pre-scanned here and the second
+//   phase merely applies a uniform shift;
+// - WriteInSecondPhase: only the block reduction is computed here and the
+//   scan itself is written in the second phase.
+// Returns an array whose element 0 is the shift for the block (identity) and
+// whose last element is the reduction of the whole range.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+auto
+Scan< Devices::Sequential, Type, PhaseType >::
+performFirstPhase( const InputArray& input,
+                   OutputArray& output,
+                   typename InputArray::IndexType begin,
+                   typename InputArray::IndexType end,
+                   typename OutputArray::IndexType outputBegin,
+                   Reduction&& reduction,
+                   typename OutputArray::ValueType identity )
+{
+   // empty range: return a single identity element so that the second phase
+   // can still read blockShifts[ 0 ]
+   if( end <= begin ) {
+      Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 1 );
+      block_results.setValue( identity );
+      return block_results;
+   }
+
+   Containers::Array< typename OutputArray::ValueType, Devices::Sequential > block_results( 2 );
+   block_results[ 0 ] = identity;
+   if( PhaseType == ScanPhaseType::WriteInFirstPhase )
+      // artificial second phase - pre-scan the block; perform() returns the
+      // reduction of the whole range
+      block_results[ 1 ] = perform( input, output, begin, end, outputBegin, reduction, identity );
+   else // ScanPhaseType::WriteInSecondPhase
+      // artificial first phase - only reduce the block; the scan itself is
+      // written in the second phase
+      block_results[ 1 ] = reduce< Devices::Sequential >( begin, end, input, reduction, identity );
+   return block_results;
+}
+
+// Second phase of a two-phase scan for the sequential device: combines the
+// block results produced by performFirstPhase (blockShifts) with an external
+// shift and finalizes the output. Note that the identity parameter is unused
+// here; it is kept for interface symmetry with the other device backends.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
+void
+Scan< Devices::Sequential, Type, PhaseType >::
+performSecondPhase( const InputArray& input,
+                    OutputArray& output,
+                    const BlockShifts& blockShifts,
+                    typename InputArray::IndexType begin,
+                    typename InputArray::IndexType end,
+                    typename OutputArray::IndexType outputBegin,
+                    Reduction&& reduction,
+                    typename OutputArray::ValueType identity,
+                    typename OutputArray::ValueType shift )
+{
+   switch( PhaseType )
+   {
+      case ScanPhaseType::WriteInFirstPhase:
+      {
+         // artificial second phase - uniform shift of a pre-scanned block
+         shift = reduction( shift, blockShifts[ 0 ] );
+         typename InputArray::IndexType outputEnd = outputBegin + end - begin;
+         for( typename InputArray::IndexType i = outputBegin; i < outputEnd; i++ )
+            output[ i ] = reduction( output[ i ], shift );
+         break;
+      }
+
+      case ScanPhaseType::WriteInSecondPhase:
+      {
+         // artificial second phase - only one block, use the shift as the initial value
+         perform( input, output, begin, end, outputBegin, reduction, reduction( shift, blockShifts[ 0 ] ) );
+         break;
+      }
+   }
+}
+
+// Parallel (OpenMP) one-pass scan: the range is split into one block per
+// thread, each block is processed locally, the per-block results are scanned
+// sequentially (the "spine" step inside 'omp single'), and the blocks are
+// then finalized with their shift values. Falls back to the sequential
+// implementation when OpenMP is disabled or the range is too small to split.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+void
+Scan< Devices::Host, Type, PhaseType >::
+perform( const InputArray& input,
+         OutputArray& output,
+         typename InputArray::IndexType begin,
+         typename InputArray::IndexType end,
+         typename OutputArray::IndexType outputBegin,
+         Reduction&& reduction,
+         typename OutputArray::ValueType identity )
+{
+#ifdef HAVE_OPENMP
+   using ValueType = typename OutputArray::ValueType;
+   using IndexType = typename InputArray::IndexType;
+
+   if( end <= begin )
+      return;
+
+   const IndexType size = end - begin;
+   const int max_threads = Devices::Host::getMaxThreadsCount();
+   // at least 1024 elements per block, so that tiny inputs are not split
+   const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
+   const IndexType blocks = TNL::roundUpDivision( size, block_size );
+
+   if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
+      const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
+      // one slot per block plus one extra slot, so the exclusive spine scan
+      // leaves the total reduction in the last slot
+      Containers::Array< ValueType > block_results( blocks + 1 );
+
+      #pragma omp parallel num_threads(threads)
+      {
+         // each thread handles exactly one block of the range
+         const int block_idx = omp_get_thread_num();
+         const IndexType block_offset = block_idx * block_size;
+         const IndexType block_begin = begin + block_offset;
+         const IndexType block_end = TNL::min( block_begin + block_size, end );
+         const IndexType block_output_begin = outputBegin + block_offset;
+
+         switch( PhaseType )
+         {
+            case ScanPhaseType::WriteInFirstPhase:
+            {
+               // step 1: pre-scan the block and save the result of the block reduction
+               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, identity );
+
+               #pragma omp barrier
+
+               // step 2: scan the block results
+               // NOTE(review): the scan also reads block_results[ blocks ],
+               // which no thread wrote - harmless only if Containers::Array
+               // initializes its elements; verify.
+               #pragma omp single
+               {
+                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, identity );
+               }
+               // (the implicit barrier at the end of 'single' publishes the
+               // scanned block results to all threads)
+
+               // step 3: uniform shift of the pre-scanned block
+               const ValueType block_shift = block_results[ block_idx ];
+               const IndexType block_output_end = block_output_begin + block_end - block_begin;
+               for( IndexType i = block_output_begin; i < block_output_end; i++ )
+                  output[ i ] = reduction( output[ i ], block_shift );
+
+               break;
+            }
+
+            case ScanPhaseType::WriteInSecondPhase:
+            {
+               // step 1: per-block reductions, write the result into the buffer
+               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, identity );
+
+               #pragma omp barrier
+
+               // step 2: scan the block results
+               #pragma omp single
+               {
+                  Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, identity );
+               }
+
+               // step 3: per-block scan using the block results as initial values
+               Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, block_results[ block_idx ] );
+
+               break;
+            }
+         }
+      }
+   }
+   else
+#endif
+      Scan< Devices::Sequential, Type >::perform( input, output, begin, end, outputBegin, reduction, identity );
+}
+
+// First phase of the OpenMP two-phase scan: processes the per-thread blocks
+// (pre-scan or reduce, depending on PhaseType) and returns an array of
+// blocks + 1 elements holding the exclusive scan of the per-block reductions,
+// i.e. the shift value for each block; the last element is the reduction of
+// the whole range. Falls back to the sequential implementation when OpenMP is
+// disabled or the range is too small to split.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+auto
+Scan< Devices::Host, Type, PhaseType >::
+performFirstPhase( const InputArray& input,
+                   OutputArray& output,
+                   typename InputArray::IndexType begin,
+                   typename InputArray::IndexType end,
+                   typename OutputArray::IndexType outputBegin,
+                   Reduction&& reduction,
+                   typename OutputArray::ValueType identity )
+{
+#ifdef HAVE_OPENMP
+   using ValueType = typename OutputArray::ValueType;
+   using IndexType = typename InputArray::IndexType;
+
+   // empty range: return a single identity element so that the second phase
+   // can still read blockShifts[ 0 ]
+   if( end <= begin ) {
+      Containers::Array< ValueType, Devices::Sequential > block_results( 1 );
+      block_results.setValue( identity );
+      return block_results;
+   }
+
+   const IndexType size = end - begin;
+   const int max_threads = Devices::Host::getMaxThreadsCount();
+   // at least 1024 elements per block, so that tiny inputs are not split
+   const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
+   const IndexType blocks = TNL::roundUpDivision( size, block_size );
+
+   if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
+      const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
+      Containers::Array< ValueType, Devices::Sequential > block_results( blocks + 1 );
+
+      #pragma omp parallel num_threads(threads)
+      {
+         // each thread handles exactly one block of the range
+         const int block_idx = omp_get_thread_num();
+         const IndexType block_offset = block_idx * block_size;
+         const IndexType block_begin = begin + block_offset;
+         const IndexType block_end = TNL::min( block_begin + block_size, end );
+         const IndexType block_output_begin = outputBegin + block_offset;
+
+         switch( PhaseType )
+         {
+            case ScanPhaseType::WriteInFirstPhase:
+            {
+               // pre-scan the block, write the result of the block reduction into the buffer
+               block_results[ block_idx ] = Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, identity );
+               break;
+            }
+
+            case ScanPhaseType::WriteInSecondPhase:
+            {
+               // upsweep: per-block reductions, write the result into the buffer
+               block_results[ block_idx ] = reduce< Devices::Sequential >( block_begin, block_end, input, reduction, identity );
+               break;
+            }
+         }
+      }
+
+      // spine step: scan the block results
+      Scan< Devices::Sequential, ScanType::Exclusive >::perform( block_results, block_results, 0, blocks + 1, 0, reduction, identity );
+
+      // block_results now contains shift values for each block - to be used in the second phase
+      return block_results;
+   }
+   else
+#endif
+      return Scan< Devices::Sequential, Type >::performFirstPhase( input, output, begin, end, outputBegin, reduction, identity );
+}
+
+// Second phase of the OpenMP two-phase scan: each thread combines the global
+// shift with its block's entry in blockShifts and finalizes its block.
+// NOTE: the block decomposition computed here (block_size, blocks) must match
+// performFirstPhase exactly, since blockShifts is indexed by block_idx.
+// The identity parameter is only forwarded to the sequential fallback.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
+void
+Scan< Devices::Host, Type, PhaseType >::
+performSecondPhase( const InputArray& input,
+                    OutputArray& output,
+                    const BlockShifts& blockShifts,
+                    typename InputArray::IndexType begin,
+                    typename InputArray::IndexType end,
+                    typename OutputArray::IndexType outputBegin,
+                    Reduction&& reduction,
+                    typename OutputArray::ValueType identity,
+                    typename OutputArray::ValueType shift )
+{
+#ifdef HAVE_OPENMP
+   using ValueType = typename OutputArray::ValueType;
+   using IndexType = typename InputArray::IndexType;
+
+   if( end <= begin )
+      return;
+
+   const IndexType size = end - begin;
+   const int max_threads = Devices::Host::getMaxThreadsCount();
+   const IndexType block_size = TNL::max( 1024, TNL::roundUpDivision( size, max_threads ) );
+   const IndexType blocks = TNL::roundUpDivision( size, block_size );
+
+   if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
+      const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
+      #pragma omp parallel num_threads(threads)
+      {
+         // each thread handles exactly one block of the range
+         const int block_idx = omp_get_thread_num();
+         const IndexType block_offset = block_idx * block_size;
+         const IndexType block_begin = begin + block_offset;
+         const IndexType block_end = TNL::min( block_begin + block_size, end );
+         const IndexType block_output_begin = outputBegin + block_offset;
+
+         // total shift for this block = external shift + this block's result
+         const ValueType block_shift = reduction( shift, blockShifts[ block_idx ] );
+
+         switch( PhaseType )
+         {
+            case ScanPhaseType::WriteInFirstPhase:
+            {
+               // uniform shift of a pre-scanned block
+               const IndexType block_output_end = block_output_begin + block_end - block_begin;
+               for( IndexType i = block_output_begin; i < block_output_end; i++ )
+                  output[ i ] = reduction( output[ i ], block_shift );
+               break;
+            }
+
+            case ScanPhaseType::WriteInSecondPhase:
+            {
+               // downsweep: per-block scan using the block results as initial values
+               Scan< Devices::Sequential, Type >::perform( input, output, block_begin, block_end, block_output_begin, reduction, block_shift );
+               break;
+            }
+         }
+      }
+   }
+   else
+#endif
+      Scan< Devices::Sequential, Type >::performSecondPhase( input, output, blockShifts, begin, end, outputBegin, reduction, identity, shift );
+}
+
+// CUDA scan: delegates the whole operation to CudaScanKernelLauncher.
+// Throws Exceptions::CudaSupportMissing when compiled without CUDA support.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+void
+Scan< Devices::Cuda, Type, PhaseType >::
+perform( const InputArray& input,
+         OutputArray& output,
+         typename InputArray::IndexType begin,
+         typename InputArray::IndexType end,
+         typename OutputArray::IndexType outputBegin,
+         Reduction&& reduction,
+         typename OutputArray::ValueType identity )
+{
+#ifdef HAVE_CUDA
+   // empty range - nothing to do
+   if( end <= begin )
+      return;
+
+   detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::perform(
+      input,
+      output,
+      begin,
+      end,
+      outputBegin,
+      std::forward< Reduction >( reduction ),
+      identity );
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+// CUDA first phase: delegates to CudaScanKernelLauncher::performFirstPhase.
+// Throws Exceptions::CudaSupportMissing when compiled without CUDA support.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename Reduction >
+auto
+Scan< Devices::Cuda, Type, PhaseType >::
+performFirstPhase( const InputArray& input,
+                   OutputArray& output,
+                   typename InputArray::IndexType begin,
+                   typename InputArray::IndexType end,
+                   typename OutputArray::IndexType outputBegin,
+                   Reduction&& reduction,
+                   typename OutputArray::ValueType identity )
+{
+#ifdef HAVE_CUDA
+   // empty range: return a single identity element so that the second phase
+   // can still read blockShifts[ 0 ] (consistent with the sequential backend)
+   if( end <= begin ) {
+      Containers::Array< typename OutputArray::ValueType, Devices::Cuda > block_results( 1 );
+      block_results.setValue( identity );
+      return block_results;
+   }
+
+   return detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performFirstPhase(
+      input,
+      output,
+      begin,
+      end,
+      outputBegin,
+      std::forward< Reduction >( reduction ),
+      identity );
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+// CUDA second phase: delegates to CudaScanKernelLauncher::performSecondPhase.
+// Throws Exceptions::CudaSupportMissing when compiled without CUDA support.
+template< ScanType Type, ScanPhaseType PhaseType >
+   template< typename InputArray,
+             typename OutputArray,
+             typename BlockShifts,
+             typename Reduction >
+void
+Scan< Devices::Cuda, Type, PhaseType >::
+performSecondPhase( const InputArray& input,
+                    OutputArray& output,
+                    const BlockShifts& blockShifts,
+                    typename InputArray::IndexType begin,
+                    typename InputArray::IndexType end,
+                    typename OutputArray::IndexType outputBegin,
+                    Reduction&& reduction,
+                    typename OutputArray::ValueType identity,
+                    typename OutputArray::ValueType shift )
+{
+#ifdef HAVE_CUDA
+   // empty range - nothing to do
+   if( end <= begin )
+      return;
+
+   detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performSecondPhase(
+      input,
+      output,
+      blockShifts,
+      begin,
+      end,
+      outputBegin,
+      std::forward< Reduction >( reduction ),
+      identity,
+      shift );
+#else
+   throw Exceptions::CudaSupportMissing();
+#endif
+}
+
+} // namespace detail
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/detail/ScanType.h b/src/TNL/Algorithms/detail/ScanType.h
new file mode 100644
index 0000000000000000000000000000000000000000..6af414436a07bc4516159e82f7141d12add49f1d
--- /dev/null
+++ b/src/TNL/Algorithms/detail/ScanType.h
@@ -0,0 +1,31 @@
+/***************************************************************************
+                          ScanType.h  -  description
+                             -------------------
+    begin                : May 9, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+namespace TNL {
+namespace Algorithms {
+namespace detail {
+
+// Scan variant: Inclusive includes the current element in each prefix;
+// Exclusive does not (the first output element is the identity).
+enum class ScanType {
+   Exclusive,
+   Inclusive
+};
+
+// Strategy for two-phase scans: selects whether the output is written already
+// in the first phase (and only shifted uniformly in the second), or entirely
+// in the second phase (the first phase computes per-block reductions only).
+enum class ScanPhaseType {
+   WriteInFirstPhase,
+   WriteInSecondPhase
+};
+
+} // namespace detail
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/distributedScan.h b/src/TNL/Algorithms/distributedScan.h
new file mode 100644
index 0000000000000000000000000000000000000000..39724f10a054783662e2920d455a43647324f602
--- /dev/null
+++ b/src/TNL/Algorithms/distributedScan.h
@@ -0,0 +1,302 @@
+/***************************************************************************
+                          distributedScan.h  -  description
+                             -------------------
+    begin                : Jul 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>  // std::forward
+
+#include <TNL/Algorithms/detail/DistributedScan.h>
+#include <TNL/Functional.h>
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of an input distributed array and stores it in an output distributed array.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam InputDistributedArray type of the distributed array to be scanned
+ * \tparam OutputDistributedArray type of the distributed array where the result is stored
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input array to be scanned
+ * \param output the array where the result of the scan is stored
+ * \param begin the first element in the array to be scanned
+ * \param end one-past-the-last element in the array to be scanned
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+// Overload with explicit bounds and identity element; see the documentation
+// comment above for the contract.
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction >
+void
+distributedInclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin,
+                          typename InputDistributedArray::IndexType end,
+                          Reduction&& reduction,
+                          typename OutputDistributedArray::ValueType identity )
+{
+   static_assert( std::is_same< typename InputDistributedArray::DeviceType, typename OutputDistributedArray::DeviceType >::value,
+                  "The input and output arrays must have the same device type." );
+   TNL_ASSERT_EQ( input.getCommunicationGroup(), output.getCommunicationGroup(),
+                  "The input and output arrays must have the same MPI communicator." );
+   TNL_ASSERT_EQ( input.getLocalRange(), output.getLocalRange(),
+                  "The input and output arrays must have the same local range on all ranks." );
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::DistributedScan< detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInFirstPhase >;
+   Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
+   // NOTE(review): presumably starts an asynchronous update of the output's
+   // ghost regions - confirm against the DistributedArray documentation
+   output.startSynchronization();
+}
+
+/**
+ * \brief Overload of \ref distributedInclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity element is taken as `reduction.template getIdentity< typename OutputDistributedArray::ValueType >()`.
+ * See \ref distributedInclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+// Convenience overload: default reduction (TNL::Plus), implicit identity.
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedInclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin = 0,
+                          typename InputDistributedArray::IndexType end = 0,
+                          Reduction&& reduction = TNL::Plus{} )
+{
+   // end == 0 is a sentinel meaning "scan up to input.getSize()"; an
+   // intentionally empty range must therefore use begin == end > 0
+   if( end == 0 )
+      end = input.getSize();
+   constexpr typename OutputDistributedArray::ValueType identity = Reduction::template getIdentity< typename OutputDistributedArray::ValueType >();
+   distributedInclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of an input distributed array and stores it in an output distributed array.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam InputDistributedArray type of the distributed array to be scanned
+ * \tparam OutputDistributedArray type of the distributed array where the result is stored
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input array to be scanned
+ * \param output the array where the result of the scan is stored
+ * \param begin the first element in the array to be scanned
+ * \param end one-past-the-last element in the array to be scanned
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+// Overload with explicit bounds and identity element; see the documentation
+// comment above for the contract.
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction >
+void
+distributedExclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin,
+                          typename InputDistributedArray::IndexType end,
+                          Reduction&& reduction,
+                          typename OutputDistributedArray::ValueType identity )
+{
+   static_assert( std::is_same< typename InputDistributedArray::DeviceType, typename OutputDistributedArray::DeviceType >::value,
+                  "The input and output arrays must have the same device type." );
+   TNL_ASSERT_EQ( input.getCommunicationGroup(), output.getCommunicationGroup(),
+                  "The input and output arrays must have the same MPI communicator." );
+   TNL_ASSERT_EQ( input.getLocalRange(), output.getLocalRange(),
+                  "The input and output arrays must have the same local range on all ranks." );
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::DistributedScan< detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInFirstPhase >;
+   Scan::perform( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
+   // NOTE(review): presumably starts an asynchronous update of the output's
+   // ghost regions - confirm against the DistributedArray documentation
+   output.startSynchronization();
+}
+
+/**
+ * \brief Overload of \ref distributedExclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity element is taken as `reduction.template getIdentity< typename OutputDistributedArray::ValueType >()`.
+ * See \ref distributedExclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+// Convenience overload: default reduction (TNL::Plus), implicit identity.
+template< typename InputDistributedArray,
+          typename OutputDistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedExclusiveScan( const InputDistributedArray& input,
+                          OutputDistributedArray& output,
+                          typename InputDistributedArray::IndexType begin = 0,
+                          typename InputDistributedArray::IndexType end = 0,
+                          Reduction&& reduction = TNL::Plus{} )
+{
+   // end == 0 is a sentinel meaning "scan up to input.getSize()"; an
+   // intentionally empty range must therefore use begin == end > 0
+   if( end == 0 )
+      end = input.getSize();
+   constexpr typename OutputDistributedArray::ValueType identity = Reduction::template getIdentity< typename OutputDistributedArray::ValueType >();
+   distributedExclusiveScan( input, output, begin, end, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of a distributed array in-place.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam DistributedArray type of the distributed array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin the first element in the array to be scanned
+ * \param end one-past-the-last element in the array to be scanned
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+// In-place overload with explicit bounds and identity element; the result of
+// the scan overwrites the input array. Uses WriteInSecondPhase (the same
+// array is both input and output).
+template< typename DistributedArray,
+          typename Reduction >
+void
+distributedInplaceInclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin,
+                                 typename DistributedArray::IndexType end,
+                                 Reduction&& reduction,
+                                 typename DistributedArray::ValueType identity )
+{
+   using Scan = detail::DistributedScan< detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInSecondPhase >;
+   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), identity );
+   // NOTE(review): presumably starts an asynchronous update of the array's
+   // ghost regions - confirm against the DistributedArray documentation
+   array.startSynchronization();
+}
+
+/**
+ * \brief Overload of \ref distributedInplaceInclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity element is taken as `reduction.template getIdentity< typename DistributedArray::ValueType >()`.
+ * See \ref distributedInplaceInclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
+ */
+// Convenience overload: default reduction (TNL::Plus), implicit identity.
+template< typename DistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedInplaceInclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin = 0,
+                                 typename DistributedArray::IndexType end = 0,
+                                 Reduction&& reduction = TNL::Plus{} )
+{
+   // end == 0 is a sentinel meaning "scan up to array.getSize()"; an
+   // intentionally empty range must therefore use begin == end > 0
+   if( end == 0 )
+      end = array.getSize();
+   constexpr typename DistributedArray::ValueType identity = Reduction::template getIdentity< typename DistributedArray::ValueType >();
+   distributedInplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of a distributed array in-place.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam DistributedArray type of the distributed array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin the first element in the array to be scanned
+ * \param end one-past-the-last element in the array to be scanned
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ */
+// In-place overload with explicit bounds and identity element; the result of
+// the scan overwrites the input array. Uses WriteInSecondPhase (the same
+// array is both input and output).
+template< typename DistributedArray,
+          typename Reduction >
+void
+distributedInplaceExclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin,
+                                 typename DistributedArray::IndexType end,
+                                 Reduction&& reduction,
+                                 typename DistributedArray::ValueType identity )
+{
+   using Scan = detail::DistributedScan< detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInSecondPhase >;
+   Scan::perform( array, array, begin, end, std::forward< Reduction >( reduction ), identity );
+   // NOTE(review): presumably starts an asynchronous update of the array's
+   // ghost regions - confirm against the DistributedArray documentation
+   array.startSynchronization();
+}
+
+/**
+ * \brief Overload of \ref distributedInplaceExclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The identity element is taken as `reduction.template getIdentity< typename DistributedArray::ValueType >()`.
+ * See \ref distributedInplaceExclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
+ */
+// Convenience overload: default reduction (TNL::Plus), implicit identity.
+template< typename DistributedArray,
+          typename Reduction = TNL::Plus >
+void
+distributedInplaceExclusiveScan( DistributedArray& array,
+                                 typename DistributedArray::IndexType begin = 0,
+                                 typename DistributedArray::IndexType end = 0,
+                                 Reduction&& reduction = TNL::Plus{} )
+{
+   // end == 0 is a sentinel meaning "scan up to array.getSize()"; an
+   // intentionally empty range must therefore use begin == end > 0
+   if( end == 0 )
+      end = array.getSize();
+   constexpr typename DistributedArray::ValueType identity = Reduction::template getIdentity< typename DistributedArray::ValueType >();
+   distributedInplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/reduce.h b/src/TNL/Algorithms/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..a769a2e1adde0c4d8f5b6e861b09a0f829afbdb3
--- /dev/null
+++ b/src/TNL/Algorithms/reduce.h
@@ -0,0 +1,411 @@
+/***************************************************************************
+                          reduce.h  -  description
+                             -------------------
+    begin                : Oct 28, 2010
+    copyright            : (C) 2010 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>  // std::pair, std::forward
+
+#include <TNL/Functional.h>  // extension of STL functionals for reduction
+#include <TNL/Algorithms/detail/Reduction.h>
+#include <TNL/Containers/Expressions/TypeTraits.h>  // RemoveET
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief \e reduce implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern))
+ * for vectors and arrays.
+ *
+ * Reduction can be used for operations having one or more vectors (or arrays)
+ * elements as input and returning one number (or element) as output. Some
+ * examples of such operations can be vectors/arrays comparison, vector norm,
+ * scalar product of two vectors or computing minimum or maximum. If one needs
+ * to know even the position of the smallest or the largest element, the
+ * function \ref reduceWithArgument can be used.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ * \tparam Reduction is a lambda function performing the reduction.
+ *
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduction is a lambda function defining the reduction operation.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ * \return result of the reduction
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduction` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/SumExampleWithLambda.cpp
+ *
+ * \par Output
+ *
+ * \include SumExampleWithLambda.out
+ */
+template< typename Device,
+          typename Index,
+          typename Result,
+          typename Fetch,
+          typename Reduction >
+Result reduce( const Index begin,
+               const Index end,
+               Fetch&& fetch,
+               Reduction&& reduction,
+               const Result& identity )
+{
+   return detail::Reduction< Device >::reduce( begin,
+                                               end,
+                                               std::forward< Fetch >( fetch ),
+                                               std::forward< Reduction >( reduction ),
+                                               identity );
+}
+
+/**
+ * \brief Variant of \ref reduce with functional instead of reduction lambda function.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ * \tparam Reduction is a functional performing the reduction.
+ *
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \e Reduction can be one of the following \ref TNL::Plus, \ref TNL::Multiplies,
+ * \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd, \ref TNL::LogicalOr,
+ * \ref TNL::BitAnd or \ref TNL::BitOr. \ref TNL::Plus is used by default.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduction is a functional object defining the reduction operation.
+ * \return result of the reduction
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/SumExampleWithFunctional.cpp
+ *
+ * \par Output
+ *
+ * \include SumExampleWithFunctional.out
+ */
+template< typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduction = TNL::Plus >
+auto reduce( const Index begin,
+             const Index end,
+             Fetch&& fetch,
+             Reduction&& reduction = TNL::Plus{} )
+{
+   using Result = Containers::Expressions::RemoveET< decltype( reduction( fetch(0), fetch(0) ) ) >;
+   return reduce< Device >( begin,
+                            end,
+                            std::forward< Fetch >( fetch ),
+                            std::forward< Reduction >( reduction ),
+                            reduction.template getIdentity< Result >() );
+}
+
+/**
+ * \brief Variant of \ref reduce for arrays, views and compatible objects.
+ *
+ * The referenced \ref reduce function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and `identity` as the identity element of the reduction.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction,
+          typename Result >
+auto reduce( const Array& array,
+             Reduction&& reduction,
+             Result identity )
+{
+   return reduce< Device >( (typename Array::IndexType) 0,
+                            array.getSize(),
+                            array.getConstView(),
+                            std::forward< Reduction >( reduction ),
+                            identity );
+}
+
+/**
+ * \brief Variant of \ref reduce for arrays, views and compatible objects.
+ *
+ * \e Reduction can be one of the following \ref TNL::Plus, \ref TNL::Multiplies,
+ * \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd, \ref TNL::LogicalOr,
+ * \ref TNL::BitAnd or \ref TNL::BitOr. \ref TNL::Plus is used by default.
+ *
+ * The referenced \ref reduce function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and the identity element obtained from the reduction functional object.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction = TNL::Plus >
+auto reduce( const Array& array,
+             Reduction&& reduction = TNL::Plus{} )
+{
+   using Result = Containers::Expressions::RemoveET< decltype( reduction( array(0), array(0) ) ) >;
+   return reduce< Array, Device >( array,
+                                   std::forward< Reduction >( reduction ),
+                                   reduction.template getIdentity< Result >() );
+}
+
+/**
+ * \brief Variant of \ref reduce returning also the position of the element of interest.
+ *
+ * For example, in case of computing minimal or maximal element in array/vector,
+ * the position of the element having given value can be obtained. This method
+ * is, however, more flexible.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Reduction is a lambda function performing the reduction.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ *
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduction is a lambda function defining the reduction operation and managing the elements positions.
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ * \return result of the reduction in a form of std::pair< Result, Index > structure. `pair.first`
+ *         is the reduction result and `pair.second` is the element position.
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduction` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/ReductionWithArgument.cpp
+ *
+ * \par Output
+ *
+ * \include ReductionWithArgument.out
+ */
+template< typename Device,
+          typename Index,
+          typename Result,
+          typename Fetch,
+          typename Reduction >
+std::pair< Result, Index >
+reduceWithArgument( const Index begin,
+                    const Index end,
+                    Fetch&& fetch,
+                    Reduction&& reduction,
+                    const Result& identity )
+{
+   return detail::Reduction< Device >::reduceWithArgument( begin,
+                                                           end,
+                                                           std::forward< Fetch >( fetch ),
+                                                           std::forward< Reduction >( reduction ),
+                                                           identity );
+}
+
+/**
+ * \brief Variant of \ref reduceWithArgument with functional instead of reduction lambda function.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ * \tparam Reduction is a functional performing the reduction.
+ * (The result type is deduced from the return type of `fetch`.)
+ *
+ * \e Device can be one of the following \ref TNL::Devices::Sequential,
+ * \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \e Reduction can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduction is a functional object defining the reduction operation and managing the elements positions.
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * obtained from the functional object as `reduction.template getIdentity< Result >()`,
+ * where `Result` is deduced from the return type of `fetch`.
+ * \return result of the reduction in a form of std::pair< Result, Index > structure. `pair.first`
+ *         is the reduction result and `pair.second` is the element position.
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduction` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
+ *
+ * \par Output
+ *
+ * \include ReductionWithArgumentWithFunctional.out
+ */
+template< typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduction >
+auto
+reduceWithArgument( const Index begin,
+                    const Index end,
+                    Fetch&& fetch,
+                    Reduction&& reduction )
+{
+   using Result = Containers::Expressions::RemoveET< decltype( fetch(0) ) >;
+   return reduceWithArgument< Device >( begin,
+                                        end,
+                                        std::forward< Fetch >( fetch ),
+                                        std::forward< Reduction >( reduction ),
+                                        reduction.template getIdentity< Result >() );
+}
+
+/**
+ * \brief Variant of \ref reduceWithArgument for arrays, views and compatible objects.
+ *
+ * The referenced \ref reduceWithArgument function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and `identity` as the identity element of the reduction.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceWithArgumentArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceWithArgumentArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction,
+          typename Result >
+auto reduceWithArgument( const Array& array,
+                         Reduction&& reduction,
+                         Result identity )
+{
+   return reduceWithArgument< Device >( (typename Array::IndexType) 0,
+                                        array.getSize(),
+                                        array.getConstView(),
+                                        std::forward< Reduction >( reduction ),
+                                        identity );
+}
+
+/**
+ * \brief Variant of \ref reduceWithArgument for arrays, views and compatible objects.
+ *
+ * \e Reduction can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
+ *
+ * The referenced \ref reduceWithArgument function is called with:
+ *
+ * - `Device`, which is `typename Array::DeviceType` by default, as the `Device` type,
+ * - `0` as the beginning of the interval for reduction,
+ * - `array.getSize()` as the end of the interval for reduction,
+ * - `array.getConstView()` as the `fetch` functor,
+ * - `reduction` as the reduction operation,
+ * - and the identity element obtained from the reduction functional object.
+ *
+ * \par Example
+ *
+ * \include Algorithms/reduceWithArgumentArrayExample.cpp
+ *
+ * \par Output
+ *
+ * \include reduceWithArgumentArrayExample.out
+ */
+template< typename Array,
+          typename Device = typename Array::DeviceType,
+          typename Reduction >
+auto reduceWithArgument( const Array& array,
+                         Reduction&& reduction )
+{
+   using Result = Containers::Expressions::RemoveET< decltype( array(0) ) >;
+   return reduceWithArgument< Array, Device >( array,
+                                               std::forward< Reduction >( reduction ),
+                                               reduction.template getIdentity< Result >() );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Algorithms/scan.h b/src/TNL/Algorithms/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..30eb5ddc2d6f7e19cdd17aa631ce8cf0e6fa8f47
--- /dev/null
+++ b/src/TNL/Algorithms/scan.h
@@ -0,0 +1,350 @@
+/***************************************************************************
+                          scan.h  -  description
+                             -------------------
+    begin                : Jul 11, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>  // std::forward
+
+#include <TNL/Algorithms/detail/Scan.h>
+#include <TNL/Functional.h>
+
+namespace TNL {
+namespace Algorithms {
+
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of an input array and
+ *        stores it in an output array.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam InputArray type of the array to be scanned
+ * \tparam OutputArray type of the output array
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input array to be scanned
+ * \param output the array where the result will be stored
+ * \param begin the first element in the array to be scanned
+ * \param end one past the last element in the array to be scanned, i.e. the scan covers the range [begin, end)
+ * \param outputBegin the first element in the output array to be written. There
+ *                    must be at least `end - begin` elements in the output
+ *                    array starting at the position given by `outputBegin`.
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/inclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include inclusiveScanExample.out
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction >
+void
+inclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin,
+               typename InputArray::IndexType end,
+               typename OutputArray::IndexType outputBegin,
+               Reduction&& reduction,
+               typename OutputArray::ValueType identity )
+{
+   static_assert( std::is_same< typename InputArray::DeviceType, typename OutputArray::DeviceType >::value,
+                  "The input and output arrays must have the same device type." );
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInFirstPhase >;
+   Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Overload of \ref inclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename OutputArray::ValueType >()`.
+ * See \ref inclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction = TNL::Plus >
+void
+inclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin = 0,
+               typename InputArray::IndexType end = 0,
+               typename OutputArray::IndexType outputBegin = 0,
+               Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = input.getSize();
+   constexpr typename OutputArray::ValueType identity = Reduction::template getIdentity< typename OutputArray::ValueType >();
+   inclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of an input array and
+ *        stores it in an output array.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam InputArray type of the array to be scanned
+ * \tparam OutputArray type of the output array
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param input the input array to be scanned
+ * \param output the array where the result will be stored
+ * \param begin the first element in the array to be scanned
+ * \param end one past the last element in the array to be scanned, i.e. the scan covers the range [begin, end)
+ * \param outputBegin the first element in the output array to be written. There
+ *                    must be at least `end - begin` elements in the output
+ *                    array starting at the position given by `outputBegin`.
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/exclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include exclusiveScanExample.out
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction >
+void
+exclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin,
+               typename InputArray::IndexType end,
+               typename OutputArray::IndexType outputBegin,
+               Reduction&& reduction,
+               typename OutputArray::ValueType identity )
+{
+   static_assert( std::is_same< typename InputArray::DeviceType, typename OutputArray::DeviceType >::value,
+                  "The input and output arrays must have the same device type." );
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
+   // TODO: check if evaluating the input is expensive (e.g. a vector expression), otherwise use WriteInSecondPhase (optimal for array-to-array)
+   using Scan = detail::Scan< typename OutputArray::DeviceType, detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInFirstPhase >;
+   Scan::perform( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Overload of \ref exclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename OutputArray::ValueType >()`.
+ * See \ref exclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `input.getSize()`.
+ */
+template< typename InputArray,
+          typename OutputArray,
+          typename Reduction = TNL::Plus >
+void
+exclusiveScan( const InputArray& input,
+               OutputArray& output,
+               typename InputArray::IndexType begin = 0,
+               typename InputArray::IndexType end = 0,
+               typename OutputArray::IndexType outputBegin = 0,
+               Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = input.getSize();
+   constexpr typename OutputArray::ValueType identity = Reduction::template getIdentity< typename OutputArray::ValueType >();
+   exclusiveScan( input, output, begin, end, outputBegin, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Computes an inclusive scan (or prefix sum) of an array in-place.
+ *
+ * [Inclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$s_1, \ldots, s_n\f$ defined as
+ *
+ * \f[
+ * s_i = \sum_{j=1}^i a_j.
+ * \f]
+ *
+ * \tparam Array type of the array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin the first element in the array to be scanned
+ * \param end one past the last element in the array to be scanned, i.e. the scan covers the range [begin, end)
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/inplaceInclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include inplaceInclusiveScanExample.out
+ */
+template< typename Array,
+          typename Reduction >
+void
+inplaceInclusiveScan( Array& array,
+                      typename Array::IndexType begin,
+                      typename Array::IndexType end,
+                      Reduction&& reduction,
+                      typename Array::ValueType identity )
+{
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
+   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Inclusive, detail::ScanPhaseType::WriteInSecondPhase >;
+   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Overload of \ref inplaceInclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename Array::ValueType >()`.
+ * See \ref inplaceInclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
+ */
+template< typename Array,
+          typename Reduction = TNL::Plus >
+void
+inplaceInclusiveScan( Array& array,
+                      typename Array::IndexType begin = 0,
+                      typename Array::IndexType end = 0,
+                      Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = array.getSize();
+   constexpr typename Array::ValueType identity = Reduction::template getIdentity< typename Array::ValueType >();
+   inplaceInclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Computes an exclusive scan (or prefix sum) of an array in-place.
+ *
+ * [Exclusive scan (or prefix sum)](https://en.wikipedia.org/wiki/Prefix_sum)
+ * operation turns a sequence \f$a_1, \ldots, a_n\f$ into a sequence
+ * \f$\sigma_1, \ldots, \sigma_n\f$ defined as
+ *
+ * \f[
+ * \sigma_i = \sum_{j=1}^{i-1} a_j.
+ * \f]
+ *
+ * \tparam Array type of the array to be scanned
+ * \tparam Reduction type of the reduction functor
+ *
+ * \param array input array, the result of scan is stored in the same array
+ * \param begin the first element in the array to be scanned
+ * \param end one past the last element in the array to be scanned, i.e. the scan covers the range [begin, end)
+ * \param reduction functor implementing the reduction operation
+ * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+ *                 for the reduction operation, i.e. element which does not
+ *                 change the result of the reduction.
+ *
+ * The reduction functor takes two variables to be reduced:
+ *
+ * ```
+ * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/inplaceExclusiveScanExample.cpp
+ *
+ * \par Output
+ *
+ * \include inplaceExclusiveScanExample.out
+ */
+template< typename Array,
+          typename Reduction >
+void
+inplaceExclusiveScan( Array& array,
+                      typename Array::IndexType begin,
+                      typename Array::IndexType end,
+                      Reduction&& reduction,
+                      typename Array::ValueType identity )
+{
+   TNL_ASSERT_EQ( reduction( identity, identity ), identity,
+                  "identity is not an identity element of the reduction operation" );
+   using Scan = detail::Scan< typename Array::DeviceType, detail::ScanType::Exclusive, detail::ScanPhaseType::WriteInSecondPhase >;
+   Scan::perform( array, array, begin, end, begin, std::forward< Reduction >( reduction ), identity );
+}
+
+/**
+ * \brief Overload of \ref inplaceExclusiveScan which uses a TNL functional
+ *        object for reduction. \ref TNL::Plus is used by default.
+ *
+ * The [identity element](https://en.wikipedia.org/wiki/Identity_element) is
+ * taken as `reduction.template getIdentity< typename Array::ValueType >()`.
+ * See \ref inplaceExclusiveScan for the explanation of other parameters.
+ * Note that when `end` equals 0 (the default), it is set to `array.getSize()`.
+ */
+template< typename Array,
+          typename Reduction = TNL::Plus >
+void
+inplaceExclusiveScan( Array& array,
+                      typename Array::IndexType begin = 0,
+                      typename Array::IndexType end = 0,
+                      Reduction&& reduction = TNL::Plus{} )
+{
+   if( end == 0 )
+      end = array.getSize();
+   constexpr typename Array::ValueType identity = Reduction::template getIdentity< typename Array::ValueType >();
+   inplaceExclusiveScan( array, begin, end, std::forward< Reduction >( reduction ), identity );
+}
+
+} // namespace Algorithms
+} // namespace TNL
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index f2c9ca705465c56287f397bb1417777f6f12f4ae..7d00683a36b9824bed91e7a91538a55505d465f8 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -56,8 +56,7 @@ template< int, typename > class StaticArray;
  * explicit data transfer which is not buffered, so it can be very slow.
  *
  * Other methods, such as \ref operator=, \ref operator==, \ref operator!=,
- * \ref setValue, \ref containsValue, \ref containsOnlyValue, and \ref evaluate,
- * provide various operations on whole arrays.
+ * \ref setValue, and \ref evaluate, provide various operations on whole arrays.
  *
  * See also \ref ArrayView, \ref Vector, \ref VectorView.
  *
@@ -498,6 +497,20 @@ class Array
        */
       __cuda_callable__ const Value& operator[]( IndexType i ) const;
 
+      /**
+       * \brief Accesses the \e i-th element of the array.
+       *
+       * Equivalent to \ref operator[], with the same notes and caveats.
+       */
+      __cuda_callable__ Value& operator()( IndexType i );
+
+      /**
+       * \brief Accesses the \e i-th element of the array.
+       *
+       * Equivalent to \ref operator[], with the same notes and caveats.
+       */
+      __cuda_callable__ const Value& operator()( IndexType i ) const;
+
       /**
        * \brief Copy-assignment operator for copying data from another array.
        *
@@ -708,192 +721,6 @@ class Array
       template< typename Function >
       void forAllElements( Function&& f ) const;
 
-       /**
-        * \brief Computes reduction with array elements on interval [ \e begin, \e end).
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-        * \param end defines range [begin, end) of indexes which will be used for the reduction.
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-       /**
-        * \brief Computes reduction with array elements on interval [ \e begin, \e end) for constant instances.
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-        * \param end defines range [begin, end) of indexes which will be used for the reduction.
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
-       /**
-        * \brief Computes reduction with all array elements.
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-       /**
-        * \brief Computes reduction with all array elements for constant instances.
-        *
-        * \tparam Fetche is a lambda function for fetching the input data.
-        * \tparam Reduce is a lambda function performing the reduction.
-        * \tparam Result is a type of the reduction result.
-        *
-        * \param fetch is a lambda function fetching the input data.
-        * \param reduce is a lambda function defining the reduction operation.
-        * \param zero is the idempotent element for the reduction operation, i.e. element which
-        *             does not change the result of the reduction.
-        * \return result of the reduction
-        *
-        * The \e Fetch lambda function takes two arguments which are index and value of the element
-        * being currently processed:
-        *
-        * ```
-        * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-        * ```
-        *
-        * The reduction lambda function takes two variables which are supposed to be reduced:
-        *
-        * ```
-        * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-        * ```
-        *
-        * \par Example
-        * \include Containers/ArrayExample_reduceElements.cpp
-        * \par Output
-        * \include ArrayExample.out
-        */
-      template< typename Fetch,
-                typename Reduce,
-                typename Result >
-      Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
-      /**
-       * \brief Checks if there is an element with value \e v.
-       *
-       * By default, all elements of the array are checked. If \e begin or
-       * \e end is set to a non-zero value, only elements in the sub-interval
-       * `[begin, end)` are checked.
-       *
-       * \param value The value to be checked.
-       * \param begin The beginning of the array sub-interval. It is 0 by
-       *              default.
-       * \param end The end of the array sub-interval. The default value is 0
-       *            which is, however, replaced with the array size.
-       * \return `true` if there is _at least one_ element in the sub-interval
-       *         `[begin, end)` which has the value \e value.
-       */
-      bool containsValue( ValueType value,
-                          IndexType begin = 0,
-                          IndexType end = 0 ) const;
-
-      /**
-       * \brief Checks if all elements have the same value \e v.
-       *
-       * By default, all elements of the array are checked. If \e begin or
-       * \e end is set to a non-zero value, only elements in the sub-interval
-       * `[begin, end)` are checked.
-       *
-       * \param value The value to be checked.
-       * \param begin The beginning of the array sub-interval. It is 0 by
-       *              default.
-       * \param end The end of the array sub-interval. The default value is 0
-       *            which is, however, replaced with the array size.
-       * \return `true` if _all_ elements in the sub-interval `[begin, end)`
-       *         have the same value \e value.
-       */
-      bool containsOnlyValue( ValueType value,
-                              IndexType begin = 0,
-                              IndexType end = 0 ) const;
-
       /**
        * \brief Method for saving the array to a binary file \e fileName.
        *
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index d935840ff8d5524be68ff94fc70b674a903af7b3..e01566e50088f9f994a906267787534e763ff407 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          Array_impl.h  -  description
+                          Array.hpp  -  description
                              -------------------
     begin                : Nov 8, 2012
     copyright            : (C) 2012 by Tomas Oberhuber
@@ -555,6 +555,30 @@ operator[]( IndexType i ) const
    return this->data[ i ];
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+__cuda_callable__
+Value&
+Array< Value, Device, Index, Allocator >::
+operator()( IndexType i )
+{
+   return operator[]( i );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+__cuda_callable__
+const Value&
+Array< Value, Device, Index, Allocator >::
+operator()( IndexType i ) const
+{
+   return operator[]( i );
+}
+
 template< typename Value,
           typename Device,
           typename Index,
@@ -732,96 +756,6 @@ forAllElements( Function&& f ) const
    view.forAllElements( f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-         typename Reduce,
-         typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   return this->getView().reduceElements( begin, end, fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-         typename Reduce,
-         typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   return this->getConstView().reduceElements( begin, end, fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   return this->getView().reduceEachElement( fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< typename Fetch,
-         typename Reduce,
-         typename Result >
-Result
-Array< Value, Device, Index, Allocator >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   return this->getConstView().reduceEachElement( fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-Array< Value, Device, Index, Allocator >::
-containsValue( ValueType value,
-               IndexType begin,
-               IndexType end ) const
-{
-   TNL_ASSERT_TRUE( this->getData(), "Attempted to check a value of an empty array." );
-   if( end == 0 )
-      end = this->getSize();
-
-   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, value );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-Array< Value, Device, Index, Allocator >::
-containsOnlyValue( ValueType value,
-                   IndexType begin,
-                   IndexType end ) const
-{
-   TNL_ASSERT_TRUE( this->getData(), "Attempted to check a value of an empty array." );
-   if( end == 0 )
-      end = this->getSize();
-
-   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
-}
-
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 31743c1f60ac4f6ed243b06e1e856c21565bf11e..b3e2416e000d9cbe6c6e59cd0bb3040ac1e01ff1 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -370,6 +370,20 @@ public:
    __cuda_callable__
    const Value& operator[]( IndexType i ) const;
 
+   /**
+    * \brief Accesses the \e i-th element of the array.
+    *
+    * Equivalent to \ref operator[], with the same notes and caveats.
+    */
+   __cuda_callable__ Value& operator()( IndexType i );
+
+   /**
+    * \brief Accesses the \e i-th element of the array.
+    *
+    * Equivalent to \ref operator[], with the same notes and caveats.
+    */
+   __cuda_callable__ const Value& operator()( IndexType i ) const;
+
    /**
     * \brief Compares the array view with another array-like container.
     *
@@ -527,192 +541,6 @@ public:
    template< typename Function >
    void forAllElements( Function&& f ) const;
 
-   /**
-    * \brief Computes reduction with array view elements on interval [ \e begin, \e end).
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-   /**
-    * \brief Computes reduction with array view elements on interval [ \e begin, \e end) for constant instances.
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
-   /**
-    * \brief Computes reduction with all array view elements.
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero );
-
-   /**
-    * \brief Computes reduction with all array view elements for constant instances.
-    *
-    * \tparam Fetche is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    * \tparam Result is a type of the reduction result.
-    *
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The \e Fetch lambda function takes two arguments which are index and value of the element
-    * being currently processed:
-    *
-    * ```
-    * auto dataFetcher1 = [=] __cuda_callable__ ( IndexType idx, Value& value ) -> Result { return ... };
-    * ```
-    *
-    * The reduction lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    * \include Containers/ArrayViewExample_reduceElements.cpp
-    * \par Output
-    * \include ArrayViewExample_reduceElements.out
-    */
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-   Result reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const;
-
-   /**
-    * \brief Checks if there is an element with value \e v.
-    *
-    * By default, all elements of the array view are checked. If \e begin or
-    * \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are checked.
-    *
-    * \param value The value to be checked.
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array view size.
-    * \return `true` if there is _at least one_ element in the sub-interval
-    *         `[begin, end)` which has the value \e value.
-    */
-   bool containsValue( ValueType value,
-                       IndexType begin = 0,
-                       IndexType end = 0 ) const;
-
-   /**
-    * \brief Checks if all elements have the same value \e v.
-    *
-    * By default, all elements of the array view are checked. If \e begin or
-    * \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are checked.
-    *
-    * \param value The value to be checked.
-    * \param begin The beginning of the array view sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the array view sub-interval. The default value is 0
-    *            which is, however, replaced with the array view size.
-    * \return `true` if _all_ elements in the sub-interval `[begin, end)`
-    *         have the same value \e value.
-    */
-   bool containsOnlyValue( ValueType value,
-                           IndexType begin = 0,
-                           IndexType end = 0 ) const;
-
    /**
     * \brief Method for saving the data to a binary file \e fileName.
     *
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index 8f6b446fe4e28673f704afb01ef51bcecb7aabe6..7771f7dc6a3d4b756f5fd920cae2329743c51483 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -1,5 +1,5 @@
 /***************************************************************************
-                          ArrayView_impl.h  -  description
+                          ArrayView.hpp  -  description
                              -------------------
     begin                : Sep 1, 2018
     copyright            : (C) 2018 by Tomas Oberhuber et al.
@@ -273,6 +273,28 @@ operator[]( IndexType i ) const
    return data[ i ];
 }
 
+template< typename Value,
+          typename Device,
+          typename Index >
+__cuda_callable__
+Value&
+ArrayView< Value, Device, Index >::
+operator()( IndexType i )
+{
+   return operator[]( i );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+__cuda_callable__
+const Value&
+ArrayView< Value, Device, Index >::
+operator()( IndexType i ) const
+{
+   return operator[]( i );
+}
+
 template< typename Value,
           typename Device,
           typename Index >
@@ -374,96 +396,6 @@ forAllElements( Function&& f ) const
    this->forElements( 0, this->getSize(), f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   if( ! this->data )
-      return zero;
-
-   ValueType* d = this->getData();
-   auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   if( ! this->data )
-      return;
-
-   const ValueType* d = this->getData();
-   auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero )
-{
-   return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-   template< typename Fetch,
-             typename Reduce,
-             typename Result >
-Result
-ArrayView< Value, Device, Index >::
-reduceEachElement( Fetch&& fetch, Reduce&& reduce, const Result& zero ) const
-{
-   return this->reduceElements( 0, this->getSize(), fetch, reduce, zero );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-ArrayView< Value, Device, Index >::
-containsValue( ValueType value,
-               IndexType begin,
-               IndexType end ) const
-{
-   if( end == 0 )
-      end = this->getSize();
-   return Algorithms::MemoryOperations< Device >::containsValue( &this->getData()[ begin ], end - begin, value );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-ArrayView< Value, Device, Index >::
-containsOnlyValue( ValueType value,
-                   IndexType begin,
-                   IndexType end ) const
-{
-   if( end == 0 )
-      end = this->getSize();
-   return Algorithms::MemoryOperations< Device >::containsOnlyValue( &this->getData()[ begin ], end - begin, value );
-}
-
 template< typename Value,
           typename Device,
           typename Index >
diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 15d8eaa53c98271d4a17019c000cfc70b14dd9ea..2c2690acd64f0ef80f13b68f2516f45d28b34020 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -254,12 +254,6 @@ public:
       void forElements( IndexType begin, IndexType end, Function&& f ) const;
 
 
-   // Checks if there is an element with given value in this array
-   bool containsValue( ValueType value ) const;
-
-   // Checks if all elements in this array have the same given value
-   bool containsOnlyValue( ValueType value ) const;
-
    // TODO: serialization (save, load)
 
 protected:
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index dcfaeee2d01929e7d339eda4d681a7907aaf0439..bda82c8bd063a22c2423485d0e3a7fc964f88d77 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -473,27 +473,5 @@ forElements( IndexType begin, IndexType end, Function&& f ) const
    this->view.forElements( begin, end, f );
 }
 
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-DistributedArray< Value, Device, Index, Allocator >::
-containsValue( ValueType value ) const
-{
-   return view.containsValue( value );
-}
-
-template< typename Value,
-          typename Device,
-          typename Index,
-          typename Allocator >
-bool
-DistributedArray< Value, Device, Index, Allocator >::
-containsOnlyValue( ValueType value ) const
-{
-   return view.containsOnlyValue( value );
-}
-
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index 9da306744f3e25f660b4fe80065308aae25b3b68..b99d08076bcb43a53e878cfca9c32ca708549fff 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -230,12 +230,6 @@ public:
       template< typename Function >
       void forElements( IndexType begin, IndexType end, Function&& f ) const;
 
-   // Checks if there is an element with given value in this array
-   bool containsValue( ValueType value ) const;
-
-   // Checks if all elements in this array have the same given value
-   bool containsOnlyValue( ValueType value ) const;
-
    std::ostream& print( std::ostream& str ) const;
 protected:
    LocalRangeType localRange;
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 223ea99c869c36b10f858769440b6eb322075515..cb9edba19a262f6c6f2b540fa76079ebd0507768 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -375,10 +375,13 @@ operator=( const DistributedArrayView& view )
    TNL_ASSERT_EQ( getLocalRange(), view.getLocalRange(), "The local ranges must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getGhosts(), view.getGhosts(), "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getCommunicationGroup(), view.getCommunicationGroup(), "The communication groups of the array views must be equal." );
-   localData = view.getConstLocalViewWithGhosts();
-   // set, but do not unset, the synchronizer
-   if( view.getSynchronizer() )
-      setSynchronizer( view.getSynchronizer(), view.getValuesPerElement() );
+
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      view.waitForSynchronization();
+      getLocalViewWithGhosts() = view.getConstLocalViewWithGhosts();
+   }
    return *this;
 }
 
@@ -394,10 +397,13 @@ operator=( const Array& array )
    TNL_ASSERT_EQ( getLocalRange(), array.getLocalRange(), "The local ranges must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getGhosts(), array.getGhosts(), "Ghosts must be equal, views are not resizable." );
    TNL_ASSERT_EQ( getCommunicationGroup(), array.getCommunicationGroup(), "The communication groups must be equal." );
-   localData = array.getConstLocalViewWithGhosts();
-   // set, but do not unset, the synchronizer
-   if( array.getSynchronizer() )
-      setSynchronizer( array.getSynchronizer(), array.getValuesPerElement() );
+
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      array.waitForSynchronization();
+      getLocalViewWithGhosts() = array.getConstLocalViewWithGhosts();
+   }
    return *this;
 }
 
@@ -466,36 +472,6 @@ forElements( IndexType begin, IndexType end, Function&& f ) const
 }
 
 
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-DistributedArrayView< Value, Device, Index >::
-containsValue( ValueType value ) const
-{
-   bool result = false;
-   if( group != MPI::NullGroup() ) {
-      const bool localResult = localData.containsValue( value );
-      MPI::Allreduce( &localResult, &result, 1, MPI_LOR, group );
-   }
-   return result;
-}
-
-template< typename Value,
-          typename Device,
-          typename Index >
-bool
-DistributedArrayView< Value, Device, Index >::
-containsOnlyValue( ValueType value ) const
-{
-   bool result = true;
-   if( group != MPI::NullGroup() ) {
-      const bool localResult = localData.containsOnlyValue( value );
-      MPI::Allreduce( &localResult, &result, 1, MPI_LAND, group );
-   }
-   return result;
-}
-
 template< typename Value,
           typename Device,
           typename Index >
diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h
index 8d737e3a975b5d4c91451bff93357f59b5864bbe..f3b53f56fa1a18679aa6c1b2c9b2227b36383eed 100644
--- a/src/TNL/Containers/DistributedVector.h
+++ b/src/TNL/Containers/DistributedVector.h
@@ -150,6 +150,11 @@ public:
              typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
    DistributedVector& operator/=( Scalar c );
 
+   template< typename Scalar,
+             typename...,
+             typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
+   DistributedVector& operator%=( Scalar c );
+
    template< typename Vector,
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
@@ -175,8 +180,10 @@ public:
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
    DistributedVector& operator/=( const Vector& vector );
 
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
+   template< typename Vector,
+             typename...,
+             typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
+   DistributedVector& operator%=( const Vector& vector );
 };
 
 // Enable expression templates for DistributedVector
diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp
index 044b747d9f42d148b17b1acb30917b5cdf04887c..2af5eab2c84b7a5cacedcb64096b7301b45ee5cd 100644
--- a/src/TNL/Containers/DistributedVector.hpp
+++ b/src/TNL/Containers/DistributedVector.hpp
@@ -13,7 +13,6 @@
 #pragma once
 
 #include "DistributedVector.h"
-#include <TNL/Algorithms/DistributedScan.h>
 
 namespace TNL {
 namespace Containers {
@@ -185,6 +184,19 @@ operator/=( const Vector& vector )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename Vector, typename..., typename >
+DistributedVector< Real, Device, Index, Allocator >&
+DistributedVector< Real, Device, Index, Allocator >::
+operator%=( const Vector& vector )
+{
+   getView() %= vector;
+   return *this;
+}
+
 template< typename Real,
           typename Device,
           typename Index,
@@ -254,12 +266,13 @@ template< typename Real,
           typename Device,
           typename Index,
           typename Allocator >
-   template< Algorithms::ScanType Type >
-void
+   template< typename Scalar, typename..., typename >
+DistributedVector< Real, Device, Index, Allocator >&
 DistributedVector< Real, Device, Index, Allocator >::
-scan( IndexType begin, IndexType end )
+operator%=( Scalar c )
 {
-   getView().template scan< Type >( begin, end );
+   getView() %= c;
+   return *this;
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h
index 4a46a47cec4eba56de0e6078ccc4557a52d37177..ee9bb287ff7856c431e4e99af7c272e82a33c205 100644
--- a/src/TNL/Containers/DistributedVectorView.h
+++ b/src/TNL/Containers/DistributedVectorView.h
@@ -121,6 +121,11 @@ public:
              typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
    DistributedVectorView& operator/=( Scalar c );
 
+   template< typename Scalar,
+             typename...,
+             typename = std::enable_if_t< ! HasSubscriptOperator<Scalar>::value > >
+   DistributedVectorView& operator%=( Scalar c );
+
    template< typename Vector,
              typename...,
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
@@ -146,8 +151,10 @@ public:
              typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
    DistributedVectorView& operator/=( const Vector& vector );
 
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
+   template< typename Vector,
+             typename...,
+             typename = std::enable_if_t< HasSubscriptOperator<Vector>::value > >
+   DistributedVectorView& operator%=( const Vector& vector );
 };
 
 // Enable expression templates for DistributedVector
diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp
index 2f9222f94efb579d3a39c803d5685283fee03b33..69ad4c74b93e53e907d9ed76689572660cf553f8 100644
--- a/src/TNL/Containers/DistributedVectorView.hpp
+++ b/src/TNL/Containers/DistributedVectorView.hpp
@@ -13,7 +13,6 @@
 #pragma once
 
 #include "DistributedVectorView.h"
-#include <TNL/Algorithms/DistributedScan.h>
 
 namespace TNL {
 namespace Containers {
@@ -213,6 +212,32 @@ operator/=( const Vector& vector )
    return *this;
 }
 
+template< typename Real,
+          typename Device,
+          typename Index >
+   template< typename Vector, typename..., typename >
+DistributedVectorView< Real, Device, Index >&
+DistributedVectorView< Real, Device, Index >::
+operator%=( const Vector& vector )
+{
+   TNL_ASSERT_EQ( this->getSize(), vector.getSize(),
+                  "Vector sizes must be equal." );
+   TNL_ASSERT_EQ( this->getLocalRange(), vector.getLocalRange(),
+                  "Multiary operations are supported only on vectors which are distributed the same way." );
+   TNL_ASSERT_EQ( this->getGhosts(), vector.getGhosts(),
+                  "Ghosts must be equal, views are not resizable." );
+   TNL_ASSERT_EQ( this->getCommunicationGroup(), vector.getCommunicationGroup(),
+                  "Multiary operations are supported only on vectors within the same communication group." );
+
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      // TODO: it might be better to split the local and ghost parts and synchronize in the middle
+      this->waitForSynchronization();
+      vector.waitForSynchronization();
+      getLocalViewWithGhosts() %= vector.getConstLocalViewWithGhosts();
+   }
+   return *this;
+}
+
 template< typename Real,
           typename Device,
           typename Index >
@@ -291,15 +316,16 @@ operator/=( Scalar c )
 template< typename Real,
           typename Device,
           typename Index >
-   template< Algorithms::ScanType Type >
-void
+   template< typename Scalar, typename..., typename >
+DistributedVectorView< Real, Device, Index >&
 DistributedVectorView< Real, Device, Index >::
-scan( IndexType begin, IndexType end )
+operator%=( Scalar c )
 {
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::DistributedScan< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
-   this->startSynchronization();
+   if( this->getCommunicationGroup() != MPI::NullGroup() ) {
+      getLocalView() %= c;
+      this->startSynchronization();
+   }
+   return *this;
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h
index 65f299120f180efcf2ff5897b1319fef473c1c3f..79d1a61d21660de6884fb82585b420aee065dd8f 100644
--- a/src/TNL/Containers/Expressions/Comparison.h
+++ b/src/TNL/Containers/Expressions/Comparison.h
@@ -14,7 +14,7 @@
 
 #include <TNL/Assert.h>
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 
 namespace TNL {
@@ -56,8 +56,8 @@ struct VectorComparison< T1, T2, false >
 {
    static bool EQ( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
 
       if( a.getSize() != b.getSize() )
          return false;
@@ -90,8 +90,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool GT( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
@@ -105,8 +105,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool GE( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
@@ -120,8 +120,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool LT( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
@@ -135,8 +135,8 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
 
    static bool LE( const T1& a, const T2& b )
    {
-      if( ! std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value )
-         throw std::runtime_error( "Cannot compare two expressions with different DeviceType." );
+      static_assert( std::is_same< typename T1::DeviceType, typename T2::DeviceType >::value,
+                     "Cannot compare two expressions with different DeviceType." );
       TNL_ASSERT_EQ( a.getSize(), b.getSize(), "Sizes of expressions to be compared do not fit." );
 
       using DeviceType = typename T1::DeviceType;
diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index e257399f6670fd81d155d51312a76ecce8eeb38c..a713b00d48f5696bd470c1f2266c55c0fdfe672b 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -56,7 +56,8 @@ template< typename T1,
           typename Operation >
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using LocalRangeType = typename T1::LocalRangeType;
@@ -154,7 +155,8 @@ template< typename T1,
           typename Operation >
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>() ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using LocalRangeType = typename T1::LocalRangeType;
@@ -235,7 +237,8 @@ template< typename T1,
           typename Operation >
 struct DistributedBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
    using LocalRangeType = typename T2::LocalRangeType;
@@ -317,7 +320,8 @@ template< typename T1,
           typename Operation >
 struct DistributedUnaryExpressionTemplate
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using LocalRangeType = typename T1::LocalRangeType;
@@ -394,50 +398,86 @@ protected:
 
 #ifndef DOXYGEN_ONLY
 
-////
-// Binary expressions addition
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-operator+( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Addition >( a, b );
-}
+#define TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION(fname, functor)                                \
+   template< typename ET1,                                                                   \
+             typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >         \
+   auto                                                                                      \
+   fname( const ET1& a )                                                                     \
+   {                                                                                         \
+      return DistributedUnaryExpressionTemplate< ET1, functor >( a );                        \
+   }                                                                                         \
+
+#define TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION(fname, functor)                               \
+   template< typename ET1, typename ET2,                                                     \
+             typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >   \
+   auto                                                                                      \
+   fname( const ET1& a, const ET2& b )                                                       \
+   {                                                                                         \
+      return DistributedBinaryExpressionTemplate< ET1, ET2, functor >( a, b );               \
+   }                                                                                         \
+
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator+, TNL::Plus )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator-, TNL::Minus )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator*, TNL::Multiplies )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator/, TNL::Divides )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( operator%, TNL::Modulus )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( min, TNL::Min )
+TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION( max, TNL::Max )
+
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( operator+, TNL::UnaryPlus )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( operator-, TNL::UnaryMinus )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( abs, TNL::Abs )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( exp, TNL::Exp )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sqrt, TNL::Sqrt )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( cbrt, TNL::Cbrt )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( log, TNL::Log )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( log10, TNL::Log10 )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( log2, TNL::Log2 )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sin, TNL::Sin )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( cos, TNL::Cos )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( tan, TNL::Tan )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( asin, TNL::Asin )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( acos, TNL::Acos )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( atan, TNL::Atan )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sinh, TNL::Sinh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( cosh, TNL::Cosh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( tanh, TNL::Tanh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( asinh, TNL::Asinh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( acosh, TNL::Acosh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( atanh, TNL::Atanh )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( floor, TNL::Floor )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( ceil, TNL::Ceil )
+TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION( sign, TNL::Sign )
+
+#undef TNL_MAKE_DISTRIBUTED_UNARY_EXPRESSION
+#undef TNL_MAKE_DISTRIBUTED_BINARY_EXPRESSION
 
 ////
-// Binary expression subtraction
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-operator-( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Subtraction >( a, b );
-}
-
-////
-// Binary expression multiplication
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+// Pow
+template< typename ET1, typename Real,
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
-operator*( const ET1& a, const ET2& b )
+pow( const ET1& a, const Real& exp )
 {
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Multiplication >( a, b );
+   return DistributedBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
 }
 
 ////
-// Binary expression division
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+// Cast
+template< typename ResultType,
+          typename ET1,
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
-operator/( const ET1& a, const ET2& b )
+cast( const ET1& a )
 {
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Division >( a, b );
+   using CastOperation = typename Cast< ResultType >::Operation;
+   return DistributedUnaryExpressionTemplate< ET1, CastOperation >( a );
 }
 
 ////
 // Comparison operator ==
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator==( const ET1& a, const ET2& b )
 {
@@ -447,7 +487,7 @@ operator==( const ET1& a, const ET2& b )
 ////
 // Comparison operator !=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator!=( const ET1& a, const ET2& b )
 {
@@ -457,7 +497,7 @@ operator!=( const ET1& a, const ET2& b )
 ////
 // Comparison operator <
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<( const ET1& a, const ET2& b )
 {
@@ -467,7 +507,7 @@ operator<( const ET1& a, const ET2& b )
 ////
 // Comparison operator <=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<=( const ET1& a, const ET2& b )
 {
@@ -477,7 +517,7 @@ operator<=( const ET1& a, const ET2& b )
 ////
 // Comparison operator >
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>( const ET1& a, const ET2& b )
 {
@@ -487,7 +527,7 @@ operator>( const ET1& a, const ET2& b )
 ////
 // Comparison operator >=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>=( const ET1& a, const ET2& b )
 {
@@ -497,7 +537,7 @@ operator>=( const ET1& a, const ET2& b )
 ////
 // Scalar product
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 operator,( const ET1& a, const ET2& b )
 {
@@ -505,291 +545,17 @@ operator,( const ET1& a, const ET2& b )
 }
 
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
+          typename..., EnableIfDistributedBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 dot( const ET1& a, const ET2& b )
 {
    return (a, b);
 }
 
-////
-// Unary expression minus
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-operator-( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Minus >( a );
-}
-
-////
-// Binary expression min
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-min( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Min >( a, b );
-}
-
-////
-// Binary expression max
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfDistributedBinaryExpression_t< ET1, ET2 >, typename = void, typename = void >
-auto
-max( const ET1& a, const ET2& b )
-{
-   return DistributedBinaryExpressionTemplate< ET1, ET2, Max >( a, b );
-}
-
-////
-// Abs
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-abs( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Abs >( a );
-}
-
-////
-// Pow
-template< typename ET1, typename Real,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-pow( const ET1& a, const Real& exp )
-{
-   return DistributedBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
-}
-
-////
-// Exp
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-exp( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Exp >( a );
-}
-
-////
-// Sqrt
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sqrt( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sqrt >( a );
-}
-
-////
-// Cbrt
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-cbrt( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Cbrt >( a );
-}
-
-////
-// Log
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-log( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Log >( a );
-}
-
-////
-// Log10
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-log10( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Log10 >( a );
-}
-
-////
-// Log2
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-log2( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Log2 >( a );
-}
-
-////
-// Sin
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sin( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sin >( a );
-}
-
-////
-// Cos
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-cos( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Cos >( a );
-}
-
-////
-// Tan
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-tan( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Tan >( a );
-}
-
-////
-// Asin
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-asin( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Asin >( a );
-}
-
-////
-// Acos
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-acos( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Acos >( a );
-}
-
-////
-// Atan
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-atan( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Atan >( a );
-}
-
-////
-// Sinh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sinh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sinh >( a );
-}
-
-////
-// Cosh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-cosh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Cosh >( a );
-}
-
-////
-// Tanh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-tanh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Tanh >( a );
-}
-
-////
-// Asinh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-asinh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Asinh >( a );
-}
-
-////
-// Acosh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-acosh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Acosh >( a );
-}
-
-////
-// Atanh
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-atanh( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Atanh >( a );
-}
-
-////
-// Floor
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-floor( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Floor >( a );
-}
-
-////
-// Ceil
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-ceil( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Ceil >( a );
-}
-
-////
-// Sign
-template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
-auto
-sign( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, Sign >( a );
-}
-
-////
-// Cast
-template< typename ResultType,
-          typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >,
-          // workaround: templated type alias cannot be declared at block level
-          typename CastOperation = typename Cast< ResultType >::Operation,
-          typename = void, typename = void >
-auto
-cast( const ET1& a )
-{
-   return DistributedUnaryExpressionTemplate< ET1, CastOperation >( a );
-}
-
 ////
 // Vertical operations
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 min( const ET1& a )
 {
@@ -797,7 +563,7 @@ min( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 argMin( const ET1& a )
 {
@@ -805,7 +571,7 @@ argMin( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 max( const ET1& a )
 {
@@ -813,7 +579,7 @@ max( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 argMax( const ET1& a )
 {
@@ -821,7 +587,7 @@ argMax( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 sum( const ET1& a )
 {
@@ -829,7 +595,7 @@ sum( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 maxNorm( const ET1& a )
 {
@@ -837,7 +603,7 @@ maxNorm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 l1Norm( const ET1& a )
 {
@@ -845,7 +611,7 @@ l1Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 l2Norm( const ET1& a )
 {
@@ -855,7 +621,7 @@ l2Norm( const ET1& a )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 lpNorm( const ET1& a, const Real& p )
 // since (1.0 / p) has type double, TNL::pow returns double
@@ -870,7 +636,7 @@ lpNorm( const ET1& a, const Real& p )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 product( const ET1& a )
 {
@@ -878,7 +644,15 @@ product( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
+auto
+logicalAnd( const ET1& a )
+{
+   return DistributedExpressionLogicalAnd( a );
+}
+
+template< typename ET1,
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 logicalOr( const ET1& a )
 {
@@ -886,15 +660,15 @@ logicalOr( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
-logicalAnd( const ET1& a )
+binaryAnd( const ET1& a )
 {
-   return DistributedExpressionLogicalAnd( a );
+   return DistributedExpressionBinaryAnd( a );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
 binaryOr( const ET1& a )
 {
@@ -902,11 +676,11 @@ binaryOr( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfDistributedUnaryExpression_t< ET1 >, typename = void, typename = void >
+          typename..., EnableIfDistributedUnaryExpression_t< ET1, bool > = true >
 auto
-binaryAnd( const ET1& a )
+binaryXor( const ET1& a )
 {
-   return DistributedExpressionBinaryAnd( a );
+   return DistributedExpressionBinaryXor( a );
 }
 
 ////
@@ -962,6 +736,7 @@ using Expressions::operator+;
 using Expressions::operator-;
 using Expressions::operator*;
 using Expressions::operator/;
+using Expressions::operator%;
 using Expressions::operator,;
 using Expressions::operator==;
 using Expressions::operator!=;
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index e1f850013a492bc071ae38cf031a381c1ae6d7e2..f3b826a9f736a1e368aa3b9fcef590abc077b07d 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -10,8 +10,8 @@
 
 #pragma once
 
-#include <TNL/Containers/Expressions/VerticalOperations.h>
 #include <TNL/MPI/Wrappers.h>
+#include <TNL/Algorithms/reduce.h>
 
 namespace TNL {
 namespace Containers {
@@ -26,7 +26,7 @@ auto DistributedExpressionMin( const Expression& expression ) -> std::decay_t< d
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionMin( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Min{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_MIN, expression.getCommunicationGroup() );
    }
    return result;
@@ -46,7 +46,7 @@ auto DistributedExpressionArgMin( const Expression& expression )
    const auto group = expression.getCommunicationGroup();
    if( group != MPI::NullGroup() ) {
       // compute local argMin
-      ResultType localResult = ExpressionArgMin( expression.getConstLocalView() );
+      ResultType localResult = Algorithms::reduceWithArgument( expression.getConstLocalView(), TNL::MinWithArg{} );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
@@ -62,15 +62,7 @@ auto DistributedExpressionArgMin( const Expression& expression )
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
       auto fetch = [_data] ( IndexType i ) { return _data[ i ].first; };
-      auto reduction = [] ( RealType& a, const RealType& b, IndexType& aIdx, const IndexType& bIdx ) {
-         if( a > b ) {
-            a = b;
-            aIdx = bIdx;
-         }
-         else if( a == b && bIdx < aIdx )
-            aIdx = bIdx;
-      };
-      result = Algorithms::reduceWithArgument< Devices::Host >( (IndexType) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::max() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( (IndexType) 0, (IndexType) nproc, fetch, TNL::MinWithArg{} );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
@@ -85,7 +77,7 @@ auto DistributedExpressionMax( const Expression& expression ) -> std::decay_t< d
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::lowest();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionMax( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Max{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_MAX, expression.getCommunicationGroup() );
    }
    return result;
@@ -105,7 +97,7 @@ auto DistributedExpressionArgMax( const Expression& expression )
    const auto group = expression.getCommunicationGroup();
    if( group != MPI::NullGroup() ) {
       // compute local argMax
-      ResultType localResult = ExpressionArgMax( expression.getConstLocalView() );
+      ResultType localResult = Algorithms::reduceWithArgument( expression.getConstLocalView(), TNL::MaxWithArg{} );
       // transform local index to global index
       localResult.second += expression.getLocalRange().getBegin();
 
@@ -121,15 +113,7 @@ auto DistributedExpressionArgMax( const Expression& expression )
       // reduce the gathered data
       const auto* _data = gatheredResults;  // workaround for nvcc which does not allow to capture variable-length arrays (even in pure host code!)
       auto fetch = [_data] ( IndexType i ) { return _data[ i ].first; };
-      auto reduction = [] ( RealType& a, const RealType& b, IndexType& aIdx, const IndexType& bIdx ) {
-         if( a < b ) {
-            a = b;
-            aIdx = bIdx;
-         }
-         else if( a == b && bIdx < aIdx )
-            aIdx = bIdx;
-      };
-      result = Algorithms::reduceWithArgument< Devices::Host >( ( IndexType ) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::lowest() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( ( IndexType ) 0, (IndexType) nproc, fetch, TNL::MaxWithArg{} );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
@@ -142,7 +126,7 @@ auto DistributedExpressionSum( const Expression& expression ) -> std::decay_t< d
 
    ResultType result = 0;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionSum( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Plus{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_SUM, expression.getCommunicationGroup() );
    }
    return result;
@@ -155,7 +139,7 @@ auto DistributedExpressionProduct( const Expression& expression ) -> std::decay_
 
    ResultType result = 1;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionProduct( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::Multiplies{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_PROD, expression.getCommunicationGroup() );
    }
    return result;
@@ -170,7 +154,7 @@ auto DistributedExpressionLogicalAnd( const Expression& expression ) -> std::dec
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionLogicalAnd( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::LogicalAnd{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_LAND, expression.getCommunicationGroup() );
    }
    return result;
@@ -183,14 +167,14 @@ auto DistributedExpressionLogicalOr( const Expression& expression ) -> std::deca
 
    ResultType result = 0;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionLogicalOr( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::LogicalOr{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_LOR, expression.getCommunicationGroup() );
    }
    return result;
 }
 
 template< typename Expression >
-auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] | expression[0] ) >
+auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< decltype( expression[0] & expression[0] ) >
 {
    using ResultType = std::decay_t< decltype( expression[0] & expression[0] ) >;
 
@@ -198,7 +182,7 @@ auto DistributedExpressionBinaryAnd( const Expression& expression ) -> std::deca
                   "std::numeric_limits is not specialized for the reduction's result type" );
    ResultType result = std::numeric_limits< ResultType >::max();
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionLogicalBinaryAnd( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::BitAnd{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_BAND, expression.getCommunicationGroup() );
    }
    return result;
@@ -211,12 +195,25 @@ auto DistributedExpressionBinaryOr( const Expression& expression ) -> std::decay
 
    ResultType result = 0;
    if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
-      const ResultType localResult = ExpressionBinaryOr( expression.getConstLocalView() );
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::BitOr{} );
       MPI::Allreduce( &localResult, &result, 1, MPI_BOR, expression.getCommunicationGroup() );
    }
    return result;
 }
 
+template< typename Expression >
+auto DistributedExpressionBinaryXor( const Expression& expression ) -> std::decay_t< decltype( expression[0] ^ expression[0] ) >
+{
+   using ResultType = std::decay_t< decltype( expression[0] ^ expression[0] ) >;
+
+   ResultType result = 0;
+   if( expression.getCommunicationGroup() != MPI::NullGroup() ) {
+      const ResultType localResult = Algorithms::reduce( expression.getConstLocalView(), TNL::BitXor{} );
+      MPI::Allreduce( &localResult, &result, 1, MPI_BXOR, expression.getCommunicationGroup() );
+   }
+   return result;
+}
+
 } // namespace Expressions
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 11b06e82269aed82ff76210d806e24f3341a1f5a..58d92609c922062da78b57ad2f27b43cfd9405fe 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -13,12 +13,12 @@
 #include <ostream>
 #include <utility>
 
+#include <TNL/Functional.h>
 #include <TNL/TypeTraits.h>
 #include <TNL/Containers/Expressions/TypeTraits.h>
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
 #include <TNL/Containers/Expressions/Comparison.h>
-#include <TNL/Containers/Expressions/HorizontalOperations.h>
-#include <TNL/Containers/Expressions/VerticalOperations.h>
+#include <TNL/Algorithms/reduce.h>
 
 namespace TNL {
 namespace Containers {
@@ -58,7 +58,8 @@ template< typename T1,
           typename Operation >
 struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using ConstViewType = BinaryExpressionTemplate;
@@ -79,13 +80,19 @@ struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, Ve
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( op1.getElement( i ), op2.getElement( i ) );
+      return Operation{}( op1.getElement( i ), op2.getElement( i ) );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( op1[ i ], op2[ i ] );
+      return Operation{}( op1[ i ], op2[ i ] );
+   }
+
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
    }
 
    __cuda_callable__
@@ -109,7 +116,8 @@ template< typename T1,
           typename Operation >
 struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>() ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using ConstViewType = BinaryExpressionTemplate;
@@ -122,13 +130,19 @@ struct BinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, Ar
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( op1.getElement( i ), op2 );
+      return Operation{}( op1.getElement( i ), op2 );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( op1[ i ], op2 );
+      return Operation{}( op1[ i ], op2 );
+   }
+
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
    }
 
    __cuda_callable__
@@ -152,7 +166,8 @@ template< typename T1,
           typename Operation >
 struct BinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable >
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T2::DeviceType;
    using IndexType = typename T2::IndexType;
    using ConstViewType = BinaryExpressionTemplate;
@@ -165,13 +180,19 @@ struct BinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorEx
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( op1, op2.getElement( i ) );
+      return Operation{}( op1, op2.getElement( i ) );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( op1, op2[ i ] );
+      return Operation{}( op1, op2[ i ] );
+   }
+
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
    }
 
    __cuda_callable__
@@ -196,7 +217,8 @@ template< typename T1,
           typename Operation >
 struct UnaryExpressionTemplate
 {
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0] ) );
+   using ValueType = RealType;
    using DeviceType = typename T1::DeviceType;
    using IndexType = typename T1::IndexType;
    using ConstViewType = UnaryExpressionTemplate;
@@ -209,13 +231,19 @@ struct UnaryExpressionTemplate
 
    RealType getElement( const IndexType i ) const
    {
-      return Operation::evaluate( operand.getElement( i ) );
+      return Operation{}( operand.getElement( i ) );
    }
 
    __cuda_callable__
    RealType operator[]( const IndexType i ) const
    {
-      return Operation::evaluate( operand[ i ] );
+      return Operation{}( operand[ i ] );
+   }
+
+   __cuda_callable__
+   RealType operator()( const IndexType i ) const
+   {
+      return operator[]( i );
    }
 
    __cuda_callable__
@@ -235,50 +263,86 @@ protected:
 
 #ifndef DOXYGEN_ONLY
 
-////
-// Binary expressions addition
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-operator+( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Addition >( a, b );
-}
-
-////
-// Binary expression subtraction
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-operator-( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Subtraction >( a, b );
-}
+#define TNL_MAKE_UNARY_EXPRESSION(fname, functor)                                \
+   template< typename ET1,                                                       \
+             typename..., EnableIfUnaryExpression_t< ET1, bool > = true >        \
+   auto                                                                          \
+   fname( const ET1& a )                                                         \
+   {                                                                             \
+      return UnaryExpressionTemplate< ET1, functor >( a );                       \
+   }                                                                             \
+
+#define TNL_MAKE_BINARY_EXPRESSION(fname, functor)                               \
+   template< typename ET1, typename ET2,                                         \
+             typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >  \
+   auto                                                                          \
+   fname( const ET1& a, const ET2& b )                                           \
+   {                                                                             \
+      return BinaryExpressionTemplate< ET1, ET2, functor >( a, b );              \
+   }                                                                             \
+
+TNL_MAKE_BINARY_EXPRESSION( operator+, TNL::Plus )
+TNL_MAKE_BINARY_EXPRESSION( operator-, TNL::Minus )
+TNL_MAKE_BINARY_EXPRESSION( operator*, TNL::Multiplies )
+TNL_MAKE_BINARY_EXPRESSION( operator/, TNL::Divides )
+TNL_MAKE_BINARY_EXPRESSION( operator%, TNL::Modulus )
+TNL_MAKE_BINARY_EXPRESSION( min, TNL::Min )
+TNL_MAKE_BINARY_EXPRESSION( max, TNL::Max )
+
+TNL_MAKE_UNARY_EXPRESSION( operator+, TNL::UnaryPlus )
+TNL_MAKE_UNARY_EXPRESSION( operator-, TNL::UnaryMinus )
+TNL_MAKE_UNARY_EXPRESSION( abs, TNL::Abs )
+TNL_MAKE_UNARY_EXPRESSION( exp, TNL::Exp )
+TNL_MAKE_UNARY_EXPRESSION( sqrt, TNL::Sqrt )
+TNL_MAKE_UNARY_EXPRESSION( cbrt, TNL::Cbrt )
+TNL_MAKE_UNARY_EXPRESSION( log, TNL::Log )
+TNL_MAKE_UNARY_EXPRESSION( log10, TNL::Log10 )
+TNL_MAKE_UNARY_EXPRESSION( log2, TNL::Log2 )
+TNL_MAKE_UNARY_EXPRESSION( sin, TNL::Sin )
+TNL_MAKE_UNARY_EXPRESSION( cos, TNL::Cos )
+TNL_MAKE_UNARY_EXPRESSION( tan, TNL::Tan )
+TNL_MAKE_UNARY_EXPRESSION( asin, TNL::Asin )
+TNL_MAKE_UNARY_EXPRESSION( acos, TNL::Acos )
+TNL_MAKE_UNARY_EXPRESSION( atan, TNL::Atan )
+TNL_MAKE_UNARY_EXPRESSION( sinh, TNL::Sinh )
+TNL_MAKE_UNARY_EXPRESSION( cosh, TNL::Cosh )
+TNL_MAKE_UNARY_EXPRESSION( tanh, TNL::Tanh )
+TNL_MAKE_UNARY_EXPRESSION( asinh, TNL::Asinh )
+TNL_MAKE_UNARY_EXPRESSION( acosh, TNL::Acosh )
+TNL_MAKE_UNARY_EXPRESSION( atanh, TNL::Atanh )
+TNL_MAKE_UNARY_EXPRESSION( floor, TNL::Floor )
+TNL_MAKE_UNARY_EXPRESSION( ceil, TNL::Ceil )
+TNL_MAKE_UNARY_EXPRESSION( sign, TNL::Sign )
+
+#undef TNL_MAKE_UNARY_EXPRESSION
+#undef TNL_MAKE_BINARY_EXPRESSION
 
 ////
-// Binary expression multiplication
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+// Pow
+template< typename ET1, typename Real,
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
-operator*( const ET1& a, const ET2& b )
+pow( const ET1& a, const Real& exp )
 {
-   return BinaryExpressionTemplate< ET1, ET2, Multiplication >( a, b );
+   return BinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
 }
 
 ////
-// Binary expression division
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+// Cast
+template< typename ResultType,
+          typename ET1,
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
-operator/( const ET1& a, const ET2& b )
+cast( const ET1& a )
 {
-   return BinaryExpressionTemplate< ET1, ET2, Division >( a, b );
+   using CastOperation = typename Cast< ResultType >::Operation;
+   return UnaryExpressionTemplate< ET1, CastOperation >( a );
 }
 
 ////
 // Comparison operator ==
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator==( const ET1& a, const ET2& b )
 {
@@ -288,7 +352,7 @@ operator==( const ET1& a, const ET2& b )
 ////
 // Comparison operator !=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator!=( const ET1& a, const ET2& b )
 {
@@ -298,7 +362,7 @@ operator!=( const ET1& a, const ET2& b )
 ////
 // Comparison operator <
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<( const ET1& a, const ET2& b )
 {
@@ -308,7 +372,7 @@ operator<( const ET1& a, const ET2& b )
 ////
 // Comparison operator <=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator<=( const ET1& a, const ET2& b )
 {
@@ -318,7 +382,7 @@ operator<=( const ET1& a, const ET2& b )
 ////
 // Comparison operator >
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>( const ET1& a, const ET2& b )
 {
@@ -328,7 +392,7 @@ operator>( const ET1& a, const ET2& b )
 ////
 // Comparison operator >=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 bool
 operator>=( const ET1& a, const ET2& b )
 {
@@ -338,339 +402,65 @@ operator>=( const ET1& a, const ET2& b )
 ////
 // Scalar product
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 operator,( const ET1& a, const ET2& b )
 {
-   return ExpressionSum( a * b );
+   return Algorithms::reduce( a * b, TNL::Plus{} );
 }
 
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
+          typename..., EnableIfBinaryExpression_t< ET1, ET2, bool > = true >
 auto
 dot( const ET1& a, const ET2& b )
 {
    return (a, b);
 }
 
-////
-// Unary expression minus
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-operator-( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Minus >( a );
-}
-
-////
-// Binary expression min
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-min( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Min >( a, b );
-}
-
-////
-// Binary expression max
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfBinaryExpression_t< ET1, ET2 >, typename = void >
-auto
-max( const ET1& a, const ET2& b )
-{
-   return BinaryExpressionTemplate< ET1, ET2, Max >( a, b );
-}
-
-////
-// Abs
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-abs( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Abs >( a );
-}
-
-////
-// Pow
-template< typename ET1, typename Real,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-pow( const ET1& a, const Real& exp )
-{
-   return BinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
-}
-
-////
-// Exp
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-exp( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Exp >( a );
-}
-
-////
-// Sqrt
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sqrt( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sqrt >( a );
-}
-
-////
-// Cbrt
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-cbrt( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Cbrt >( a );
-}
-
-////
-// Log
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-log( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Log >( a );
-}
-
-////
-// Log10
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-log10( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Log10 >( a );
-}
-
-////
-// Log2
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-log2( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Log2 >( a );
-}
-
-////
-// Sin
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sin( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sin >( a );
-}
-
-////
-// Cos
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-cos( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Cos >( a );
-}
-
-////
-// Tan
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-tan( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Tan >( a );
-}
-
-////
-// Asin
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-asin( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Asin >( a );
-}
-
-////
-// Acos
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-acos( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Acos >( a );
-}
-
-////
-// Atan
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-atan( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Atan >( a );
-}
-
-////
-// Sinh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sinh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sinh >( a );
-}
-
-////
-// Cosh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-cosh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Cosh >( a );
-}
-
-////
-// Tanh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-tanh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Tanh >( a );
-}
-
-////
-// Asinh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-asinh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Asinh >( a );
-}
-
-////
-// Acosh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-acosh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Acosh >( a );
-}
-
-////
-// Atanh
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-atanh( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Atanh >( a );
-}
-
-////
-// Floor
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-floor( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Floor >( a );
-}
-
-////
-// Ceil
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-ceil( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Ceil >( a );
-}
-
-////
-// Sign
-template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
-auto
-sign( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, Sign >( a );
-}
-
-////
-// Cast
-template< typename ResultType,
-          typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >,
-          // workaround: templated type alias cannot be declared at block level
-          typename CastOperation = typename Cast< ResultType >::Operation,
-          typename = void >
-auto
-cast( const ET1& a )
-{
-   return UnaryExpressionTemplate< ET1, CastOperation >( a );
-}
-
 ////
 // Vertical operations
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 min( const ET1& a )
 {
-   return ExpressionMin( a );
+   return Algorithms::reduce( a, TNL::Min{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 argMin( const ET1& a )
 {
-   return ExpressionArgMin( a );
+   return Algorithms::reduceWithArgument( a, TNL::MinWithArg{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 max( const ET1& a )
 {
-   return ExpressionMax( a );
+   return Algorithms::reduce( a, TNL::Max{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 argMax( const ET1& a )
 {
-   return ExpressionArgMax( a );
+   return Algorithms::reduceWithArgument( a, TNL::MaxWithArg{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 sum( const ET1& a )
 {
-   return ExpressionSum( a );
+   return Algorithms::reduce( a, TNL::Plus{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 maxNorm( const ET1& a )
 {
@@ -678,7 +468,7 @@ maxNorm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 l1Norm( const ET1& a )
 {
@@ -686,7 +476,7 @@ l1Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 l2Norm( const ET1& a )
 {
@@ -696,7 +486,7 @@ l2Norm( const ET1& a )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 lpNorm( const ET1& a, const Real& p )
 // since (1.0 / p) has type double, TNL::pow returns double
@@ -711,43 +501,51 @@ lpNorm( const ET1& a, const Real& p )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 product( const ET1& a )
 {
-   return ExpressionProduct( a );
+   return Algorithms::reduce( a, TNL::Multiplies{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 logicalAnd( const ET1& a )
 {
-   return ExpressionLogicalAnd( a );
+   return Algorithms::reduce( a, TNL::LogicalAnd{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 logicalOr( const ET1& a )
 {
-   return ExpressionLogicalOr( a );
+   return Algorithms::reduce( a, TNL::LogicalOr{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 binaryAnd( const ET1& a )
 {
-   return ExpressionBinaryAnd( a );
+   return Algorithms::reduce( a, TNL::BitAnd{} );
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfUnaryExpression_t< ET1 >, typename = void >
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
 auto
 binaryOr( const ET1& a )
 {
-   return ExpressionBinaryOr( a );
+   return Algorithms::reduce( a, TNL::BitOr{} );
+}
+
+template< typename ET1,
+          typename..., EnableIfUnaryExpression_t< ET1, bool > = true >
+auto
+binaryXor( const ET1& a )
+{
+   return Algorithms::reduce( a, TNL::BitXor{} );
 }
 
 #endif // DOXYGEN_ONLY
@@ -785,6 +583,7 @@ using Expressions::operator+;
 using Expressions::operator-;
 using Expressions::operator*;
 using Expressions::operator/;
+using Expressions::operator%;
 using Expressions::operator,;
 using Expressions::operator==;
 using Expressions::operator!=;
diff --git a/src/TNL/Containers/Expressions/HorizontalOperations.h b/src/TNL/Containers/Expressions/HorizontalOperations.h
deleted file mode 100644
index 614f2c878ebc0a744f5f926ccb730e945d8c6dda..0000000000000000000000000000000000000000
--- a/src/TNL/Containers/Expressions/HorizontalOperations.h
+++ /dev/null
@@ -1,335 +0,0 @@
-/***************************************************************************
-                          HorizontalOperations.h  -  description
-                             -------------------
-    begin                : Apr 18, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Math.h>
-
-namespace TNL {
-namespace Containers {
-namespace Expressions {
-
-struct Addition
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a + b )
-   {
-      return a + b;
-   }
-};
-
-struct Subtraction
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a - b )
-   {
-      return a - b;
-   }
-};
-
-struct Multiplication
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a * b )
-   {
-      return a * b;
-   }
-};
-
-struct Division
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( a / b )
-   {
-      return a / b;
-   }
-};
-
-struct Min
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( min( a, b ) )
-   {
-      return min( a, b );
-   }
-};
-
-struct Max
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& b ) -> decltype( max( a, b ) )
-   {
-      return max( a, b );
-   }
-};
-
-struct Minus
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( -a )
-   {
-      return -a;
-   }
-};
-
-struct Abs
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( abs( a ) )
-   {
-      return abs( a );
-   }
-};
-
-struct Pow
-{
-   template< typename T1, typename T2 >
-   __cuda_callable__
-   static auto evaluate( const T1& a, const T2& exp ) -> decltype( pow( a, exp ) )
-   {
-      return pow( a, exp );
-   }
-};
-
-struct Exp
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( exp( a ) )
-   {
-      return exp( a );
-   }
-};
-
-struct Sqrt
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sqrt( a ) )
-   {
-      return sqrt( a );
-   }
-};
-
-struct Cbrt
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( cbrt( a ) )
-   {
-      return cbrt( a );
-   }
-};
-
-struct Log
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( log( a ) )
-   {
-      return log( a );
-   }
-};
-
-struct Log10
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( log10( a ) )
-   {
-      return log10( a );
-   }
-};
-
-struct Log2
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( log2( a ) )
-   {
-      return log2( a );
-   }
-};
-
-struct Sin
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sin( a ) )
-   {
-      return sin( a );
-   }
-};
-
-struct Cos
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( cos( a ) )
-   {
-      return cos( a );
-   }
-};
-
-struct Tan
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( tan( a ) )
-   {
-      return tan( a );
-   }
-};
-
-struct Asin
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( asin( a ) )
-   {
-      return asin( a );
-   }
-};
-
-struct Acos
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( acos( a ) )
-   {
-      return acos( a );
-   }
-};
-
-struct Atan
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( atan( a ) )
-   {
-      return atan( a );
-   }
-};
-
-struct Sinh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sinh( a ) )
-   {
-      return sinh( a );
-   }
-};
-
-struct Cosh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( cosh( a ) )
-   {
-      return cosh( a );
-   }
-};
-
-struct Tanh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( tanh( a ) )
-   {
-      return tanh( a );
-   }
-};
-
-struct Asinh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( asinh( a ) )
-   {
-      return asinh( a );
-   }
-};
-
-struct Acosh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( acosh( a ) )
-   {
-      return acosh( a );
-   }
-};
-
-struct Atanh
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( atanh( a ) )
-   {
-      return atanh( a );
-   }
-};
-
-struct Floor
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( floor( a ) )
-   {
-      return floor( a );
-   }
-};
-
-struct Ceil
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( ceil( a ) )
-   {
-      return ceil( a );
-   }
-};
-
-struct Sign
-{
-   template< typename T1 >
-   __cuda_callable__
-   static auto evaluate( const T1& a ) -> decltype( sign( a ) )
-   {
-      return sign( a );
-   }
-};
-
-template< typename ResultType >
-struct Cast
-{
-   struct Operation
-   {
-      template< typename T1 >
-      __cuda_callable__
-      static auto evaluate( const T1& a ) -> ResultType
-      {
-         return static_cast<ResultType>( a );
-      }
-   };
-};
-
-} // namespace Expressions
-} // namespace Containers
-} // namespace TNL
diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
index da2c8cdd2f4ebf5f3bfb15a141ed6c739ef91a5c..102656e05556bb719e6d1602b779e6b8a15e6c60 100644
--- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h
@@ -13,10 +13,10 @@
 #include <ostream>
 #include <utility>
 
+#include <TNL/Functional.h>
 #include <TNL/TypeTraits.h>
 #include <TNL/Containers/Expressions/TypeTraits.h>
 #include <TNL/Containers/Expressions/ExpressionVariableType.h>
-#include <TNL/Containers/Expressions/HorizontalOperations.h>
 #include <TNL/Containers/Expressions/StaticComparison.h>
 #include <TNL/Containers/Expressions/StaticVerticalOperations.h>
 
@@ -59,7 +59,8 @@ template< typename T1,
 struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, VectorExpressionVariable >
 {
    using VectorOperandType = T1;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>()[0] ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
                   "Left-hand side operand of static expression is not static, i.e. based on static vector." );
@@ -81,7 +82,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariab
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( op1[ i ], op2[ i ] );
+      return Operation{}( op1[ i ], op2[ i ] );
    }
 
    __cuda_callable__
@@ -113,7 +114,8 @@ template< typename T1,
 struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariable, ArithmeticVariable  >
 {
    using VectorOperandType = T1;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0], std::declval<T2>() ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0], std::declval<T2>() ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
                   "Left-hand side operand of static expression is not static, i.e. based on static vector." );
@@ -129,7 +131,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, VectorExpressionVariab
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( op1[ i ], op2 );
+      return Operation{}( op1[ i ], op2 );
    }
 
    __cuda_callable__
@@ -161,7 +163,8 @@ template< typename T1,
 struct StaticBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, VectorExpressionVariable  >
 {
    using VectorOperandType = T2;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>(), std::declval<T2>()[0] ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T2 >::value,
                   "Right-hand side operand of static expression is not static, i.e. based on static vector." );
@@ -177,7 +180,7 @@ struct StaticBinaryExpressionTemplate< T1, T2, Operation, ArithmeticVariable, Ve
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( op1, op2[ i ] );
+      return Operation{}( op1, op2[ i ] );
    }
 
    __cuda_callable__
@@ -210,7 +213,8 @@ template< typename T1,
 struct StaticUnaryExpressionTemplate
 {
    using VectorOperandType = T1;
-   using RealType = decltype( Operation::evaluate( std::declval<T1>()[0] ) );
+   using RealType = decltype( Operation{}( std::declval<T1>()[0] ) );
+   using ValueType = RealType;
 
    static_assert( IsStaticArrayType< T1 >::value,
                   "The operand of static expression is not static, i.e. based on static vector." );
@@ -226,7 +230,7 @@ struct StaticUnaryExpressionTemplate
    __cuda_callable__
    RealType operator[]( const int i ) const
    {
-      return Operation::evaluate( operand[ i ] );
+      return Operation{}( operand[ i ] );
    }
 
    __cuda_callable__
@@ -253,54 +257,90 @@ protected:
 
 #ifndef DOXYGEN_ONLY
 
-////
-// Binary expressions addition
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-operator+( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Addition >( a, b );
-}
+#define TNL_MAKE_STATIC_UNARY_EXPRESSION(fname, functor)                               \
+   template< typename ET1,                                                             \
+             typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >        \
+   __cuda_callable__                                                                   \
+   auto                                                                                \
+   fname( const ET1& a )                                                               \
+   {                                                                                   \
+      return StaticUnaryExpressionTemplate< ET1, functor >( a );                       \
+   }                                                                                   \
+
+#define TNL_MAKE_STATIC_BINARY_EXPRESSION(fname, functor)                              \
+   template< typename ET1, typename ET2,                                               \
+             typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >  \
+   __cuda_callable__                                                                   \
+   auto                                                                                \
+   fname( const ET1& a, const ET2& b )                                                 \
+   {                                                                                   \
+      return StaticBinaryExpressionTemplate< ET1, ET2, functor >( a, b );              \
+   }                                                                                   \
+
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator+, TNL::Plus )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator-, TNL::Minus )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator*, TNL::Multiplies )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator/, TNL::Divides )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( operator%, TNL::Modulus )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( min, TNL::Min )
+TNL_MAKE_STATIC_BINARY_EXPRESSION( max, TNL::Max )
+
+TNL_MAKE_STATIC_UNARY_EXPRESSION( operator+, TNL::UnaryPlus )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( operator-, TNL::UnaryMinus )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( abs, TNL::Abs )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( exp, TNL::Exp )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sqrt, TNL::Sqrt )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( cbrt, TNL::Cbrt )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( log, TNL::Log )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( log10, TNL::Log10 )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( log2, TNL::Log2 )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sin, TNL::Sin )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( cos, TNL::Cos )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( tan, TNL::Tan )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( asin, TNL::Asin )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( acos, TNL::Acos )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( atan, TNL::Atan )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sinh, TNL::Sinh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( cosh, TNL::Cosh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( tanh, TNL::Tanh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( asinh, TNL::Asinh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( acosh, TNL::Acosh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( atanh, TNL::Atanh )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( floor, TNL::Floor )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( ceil, TNL::Ceil )
+TNL_MAKE_STATIC_UNARY_EXPRESSION( sign, TNL::Sign )
+
+#undef TNL_MAKE_STATIC_UNARY_EXPRESSION
+#undef TNL_MAKE_STATIC_BINARY_EXPRESSION
 
 ////
-// Binary expression subtraction
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-operator-( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Subtraction >( a, b );
-}
-
-////
-// Binary expression multiplication
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+// Pow
+template< typename ET1, typename Real,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
-operator*( const ET1& a, const ET2& b )
+pow( const ET1& a, const Real& exp )
 {
-   return StaticBinaryExpressionTemplate< ET1, ET2, Multiplication >( a, b );
+   return StaticBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
 }
 
 ////
-// Binary expression division
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+// Cast
+template< typename ResultType,
+          typename ET1,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
-operator/( const ET1& a, const ET2& b )
+cast( const ET1& a )
 {
-   return StaticBinaryExpressionTemplate< ET1, ET2, Division >( a, b );
+   using CastOperation = typename Cast< ResultType >::Operation;
+   return StaticUnaryExpressionTemplate< ET1, CastOperation >( a );
 }
 
 ////
 // Comparison operator ==
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator==( const ET1& a, const ET2& b )
@@ -311,7 +351,7 @@ operator==( const ET1& a, const ET2& b )
 ////
 // Comparison operator !=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator!=( const ET1& a, const ET2& b )
@@ -322,7 +362,7 @@ operator!=( const ET1& a, const ET2& b )
 ////
 // Comparison operator <
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator<( const ET1& a, const ET2& b )
@@ -333,7 +373,7 @@ operator<( const ET1& a, const ET2& b )
 ////
 // Comparison operator <=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator<=( const ET1& a, const ET2& b )
@@ -344,7 +384,7 @@ operator<=( const ET1& a, const ET2& b )
 ////
 // Comparison operator >
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator>( const ET1& a, const ET2& b )
@@ -355,18 +395,18 @@ operator>( const ET1& a, const ET2& b )
 ////
 // Comparison operator >=
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 bool
 operator>=( const ET1& a, const ET2& b )
 {
-   return Expressions::StaticComparison< ET1, ET2 >::GE( a, b );
+   return StaticComparison< ET1, ET2 >::GE( a, b );
 }
 
 ////
 // Scalar product
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 auto
 operator,( const ET1& a, const ET2& b )
@@ -375,7 +415,7 @@ operator,( const ET1& a, const ET2& b )
 }
 
 template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
+          typename..., EnableIfStaticBinaryExpression_t< ET1, ET2, bool > = true >
 __cuda_callable__
 auto
 dot( const ET1& a, const ET2& b )
@@ -383,310 +423,10 @@ dot( const ET1& a, const ET2& b )
    return (a, b);
 }
 
-////
-// Unary expression minus
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-operator-( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Minus >( a );
-}
-
-////
-// Binary expression min
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-min( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Min >( a, b );
-}
-
-////
-// Binary expression max
-template< typename ET1, typename ET2,
-          typename..., typename = EnableIfStaticBinaryExpression_t< ET1, ET2 > >
-__cuda_callable__
-auto
-max( const ET1& a, const ET2& b )
-{
-   return StaticBinaryExpressionTemplate< ET1, ET2, Max >( a, b );
-}
-
-////
-// Abs
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-abs( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Abs >( a );
-}
-
-////
-// Pow
-template< typename ET1, typename Real,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-pow( const ET1& a, const Real& exp )
-{
-   return StaticBinaryExpressionTemplate< ET1, Real, Pow >( a, exp );
-}
-
-////
-// Exp
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-exp( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Exp >( a );
-}
-
-////
-// Sqrt
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sqrt( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sqrt >( a );
-}
-
-////
-// Cbrt
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-cbrt( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Cbrt >( a );
-}
-
-////
-// Log
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-log( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Log >( a );
-}
-
-////
-// Log10
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-log10( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Log10 >( a );
-}
-
-////
-// Log2
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-log2( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Log2 >( a );
-}
-
-////
-// Sin
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sin( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sin >( a );
-}
-
-////
-// Cos
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-cos( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Cos >( a );
-}
-
-////
-// Tan
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-tan( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Tan >( a );
-}
-
-////
-// Asin
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-asin( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Asin >( a );
-}
-
-////
-// Acos
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-acos( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Acos >( a );
-}
-
-////
-// Atan
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-atan( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Atan >( a );
-}
-
-////
-// Sinh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sinh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sinh >( a );
-}
-
-////
-// Cosh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-cosh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Cosh >( a );
-}
-
-////
-// Tanh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-tanh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Tanh >( a );
-}
-
-////
-// Asinh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-asinh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Asinh >( a );
-}
-
-////
-// Acosh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-acosh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Acosh >( a );
-}
-
-////
-// Atanh
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-atanh( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Atanh >( a );
-}
-
-////
-// Floor
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-floor( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Floor >( a );
-}
-
-////
-// Ceil
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-ceil( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Ceil >( a );
-}
-
-////
-// Sign
-template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
-__cuda_callable__
-auto
-sign( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, Sign >( a );
-}
-
-////
-// Cast
-template< typename ResultType,
-          typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
-          // workaround: templated type alias cannot be declared at block level
-          typename CastOperation = typename Cast< ResultType >::Operation >
-__cuda_callable__
-auto
-cast( const ET1& a )
-{
-   return StaticUnaryExpressionTemplate< ET1, CastOperation >( a );
-}
-
 ////
 // Vertical operations
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 min( const ET1& a )
@@ -695,7 +435,7 @@ min( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 argMin( const ET1& a )
@@ -704,7 +444,7 @@ argMin( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 max( const ET1& a )
@@ -713,7 +453,7 @@ max( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 argMax( const ET1& a )
@@ -722,7 +462,7 @@ argMax( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 sum( const ET1& a )
@@ -731,7 +471,7 @@ sum( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 maxNorm( const ET1& a )
@@ -740,7 +480,7 @@ maxNorm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 l1Norm( const ET1& a )
@@ -749,7 +489,7 @@ l1Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< (ET1::getSize() > 1), bool > = true >
 __cuda_callable__
 auto
@@ -760,7 +500,7 @@ l2Norm( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< ET1::getSize() == 1, bool > = true >
 __cuda_callable__
 auto
@@ -772,7 +512,7 @@ l2Norm( const ET1& a )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< (ET1::getSize() > 1), bool > = true >
 __cuda_callable__
 auto
@@ -791,7 +531,7 @@ lpNorm( const ET1& a, const Real& p )
 
 template< typename ET1,
           typename Real,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 >,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true,
           std::enable_if_t< ET1::getSize() == 1, bool > = true >
 __cuda_callable__
 auto
@@ -802,7 +542,7 @@ lpNorm( const ET1& a, const Real& p )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 product( const ET1& a )
@@ -811,7 +551,7 @@ product( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 logicalAnd( const ET1& a )
@@ -820,7 +560,7 @@ logicalAnd( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 logicalOr( const ET1& a )
@@ -829,7 +569,7 @@ logicalOr( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 binaryAnd( const ET1& a )
@@ -838,7 +578,7 @@ binaryAnd( const ET1& a )
 }
 
 template< typename ET1,
-          typename..., typename = EnableIfStaticUnaryExpression_t< ET1 > >
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
 __cuda_callable__
 auto
 binaryOr( const ET1& a )
@@ -846,6 +586,15 @@ binaryOr( const ET1& a )
    return StaticExpressionBinaryOr( a );
 }
 
+template< typename ET1,
+          typename..., EnableIfStaticUnaryExpression_t< ET1, bool > = true >
+__cuda_callable__
+auto
+binaryXor( const ET1& a )
+{
+   return StaticExpressionBinaryXor( a );
+}
+
 #endif // DOXYGEN_ONLY
 
 ////
@@ -881,6 +630,7 @@ using Expressions::operator+;
 using Expressions::operator-;
 using Expressions::operator*;
 using Expressions::operator/;
+using Expressions::operator%;
 using Expressions::operator,;
 using Expressions::operator==;
 using Expressions::operator!=;
diff --git a/src/TNL/Containers/Expressions/StaticVerticalOperations.h b/src/TNL/Containers/Expressions/StaticVerticalOperations.h
index b958830031a260f1240be6ca1e5d6f7978fb3c47..fac7cf244ee5facc0c87f65655c8e34f41d596af 100644
--- a/src/TNL/Containers/Expressions/StaticVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/StaticVerticalOperations.h
@@ -145,6 +145,16 @@ auto StaticExpressionBinaryOr( const Expression& expression )
    return aux;
 }
 
+template< typename Expression >
+__cuda_callable__
+auto StaticExpressionBinaryXor( const Expression& expression )
+{
+   auto aux = expression[ 0 ];
+   for( int i = 1; i < expression.getSize(); i++ )
+      aux = aux ^ expression[ i ];
+   return aux;
+}
+
 } // namespace Expressions
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/TypeTraits.h b/src/TNL/Containers/Expressions/TypeTraits.h
index 3142ee35ec1f4c7829cb616affb901af6e3f2573..9af3ef8184505a96c55784fdeff556f7dde2a0f5 100644
--- a/src/TNL/Containers/Expressions/TypeTraits.h
+++ b/src/TNL/Containers/Expressions/TypeTraits.h
@@ -31,11 +31,12 @@ struct HasEnabledDistributedExpressionTemplates : std::false_type
 
 
 // type aliases for enabling specific operators and functions using SFINAE
-template< typename ET1 >
+template< typename ET1, typename T = void >
 using EnableIfStaticUnaryExpression_t = std::enable_if_t<
-      HasEnabledStaticExpressionTemplates< std::decay_t< ET1 > >::value >;
+         HasEnabledStaticExpressionTemplates< std::decay_t< ET1 > >::value,
+      T >;
 
-template< typename ET1, typename ET2 >
+template< typename ET1, typename ET2, typename T = void >
 using EnableIfStaticBinaryExpression_t = std::enable_if_t<
       (
          HasEnabledStaticExpressionTemplates< std::decay_t< ET1 > >::value ||
@@ -46,13 +47,15 @@ using EnableIfStaticBinaryExpression_t = std::enable_if_t<
          HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value ||
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET2 > >::value ||
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value
-      ) >;
+      ),
+      T >;
 
-template< typename ET1 >
+template< typename ET1, typename T = void >
 using EnableIfUnaryExpression_t = std::enable_if_t<
-      HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value >;
+         HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value,
+      T >;
 
-template< typename ET1, typename ET2 >
+template< typename ET1, typename ET2, typename T = void >
 using EnableIfBinaryExpression_t = std::enable_if_t<
       // we need to avoid ambiguity with operators defined in Array (e.g. Array::operator==)
       // so the first operand must not be Array
@@ -64,13 +67,15 @@ using EnableIfBinaryExpression_t = std::enable_if_t<
       (
          HasEnabledExpressionTemplates< std::decay_t< ET2 > >::value ||
          HasEnabledExpressionTemplates< std::decay_t< ET1 > >::value
-      ) >;
+      ),
+      T >;
 
-template< typename ET1 >
+template< typename ET1, typename T = void >
 using EnableIfDistributedUnaryExpression_t = std::enable_if_t<
-      HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value >;
+         HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value,
+      T >;
 
-template< typename ET1, typename ET2 >
+template< typename ET1, typename ET2, typename T = void >
 using EnableIfDistributedBinaryExpression_t = std::enable_if_t<
       // we need to avoid ambiguity with operators defined in Array (e.g. Array::operator==)
       // so the first operand must not be Array
@@ -82,7 +87,8 @@ using EnableIfDistributedBinaryExpression_t = std::enable_if_t<
       (
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET2 > >::value ||
          HasEnabledDistributedExpressionTemplates< std::decay_t< ET1 > >::value
-      ) >;
+      ),
+      T >;
 
 
 // helper trait class for recursively turning expression template classes into compatible vectors
@@ -107,20 +113,12 @@ using RemoveET = typename RemoveExpressionTemplate< R >::type;
 
 template< typename T1, typename T2 >
 constexpr std::enable_if_t<
-      ! ( std::is_arithmetic< T1 >::value && std::is_arithmetic< T2 >::value ) &&
       ! ( IsStaticArrayType< T1 >::value && IsStaticArrayType< T2 >::value ) &&
       ! ( IsArrayType< T1 >::value && IsArrayType< T2 >::value )
 , bool >
 compatibleForVectorAssignment()
 {
-   return false;
-}
-
-template< typename T1, typename T2 >
-constexpr std::enable_if_t< std::is_arithmetic< T1 >::value && std::is_arithmetic< T2 >::value, bool >
-compatibleForVectorAssignment()
-{
-   return true;
+   return IsScalarType< T1 >::value && IsScalarType< T2 >::value;
 }
 
 template< typename T1, typename T2 >
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
deleted file mode 100644
index ff094e4ea9d29ef9eef3aee3461d0029662e5c35..0000000000000000000000000000000000000000
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/***************************************************************************
-                          VerticalOperations.h  -  description
-                             -------------------
-    begin                : May 1, 2019
-    copyright            : (C) 2019 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <limits>
-#include <type_traits>
-
-#include <TNL/Algorithms/Reduction.h>
-#include <TNL/Containers/Expressions/TypeTraits.h>
-
-////
-// By vertical operations we mean those applied across vector elements or
-// vector expression elements. It means for example minim/maximum of all
-// vector elements etc.
-namespace TNL {
-namespace Containers {
-namespace Expressions {
-
-////
-// Vertical operations
-template< typename Expression >
-auto ExpressionMin( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b )
-   {
-      // use argument-dependent lookup and make TNL::min available for unqualified calls
-      using TNL::min;
-      return min( a, b );
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
-}
-
-template< typename Expression >
-auto ExpressionArgMin( const Expression& expression )
--> RemoveET< std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( ResultType& a, const ResultType& b, IndexType& aIdx, const IndexType& bIdx ) {
-      if( a > b ) {
-         a = b;
-         aIdx = bIdx;
-      }
-      else if( a == b && bIdx < aIdx )
-         aIdx = bIdx;
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
-}
-
-template< typename Expression >
-auto ExpressionMax( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b )
-   {
-      // use argument-dependent lookup and make TNL::max available for unqualified calls
-      using TNL::max;
-      return max( a, b );
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
-}
-
-template< typename Expression >
-auto ExpressionArgMax( const Expression& expression )
--> RemoveET< std::pair< std::decay_t< decltype( expression[0] ) >, typename Expression::IndexType > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   auto reduction = [] __cuda_callable__ ( ResultType& a, const ResultType& b, IndexType& aIdx, const IndexType& bIdx ) {
-      if( a < b ) {
-         a = b;
-         aIdx = bIdx;
-      }
-      else if( a == b && bIdx < aIdx )
-         aIdx = bIdx;
-   };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
-}
-
-template< typename Expression >
-auto ExpressionSum( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] + expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] + expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::plus<>{}, (ResultType) 0 );
-}
-
-template< typename Expression >
-auto ExpressionProduct( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] * expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] * expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::multiplies<>{}, (ResultType) 1 );
-}
-
-template< typename Expression >
-auto ExpressionLogicalAnd( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] && expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] && expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_and<>{}, std::numeric_limits< ResultType >::max() );
-}
-
-template< typename Expression >
-auto ExpressionLogicalOr( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] || expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] || expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_or<>{}, (ResultType) 0 );
-}
-
-template< typename Expression >
-auto ExpressionBinaryAnd( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] & expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] & expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   static_assert( std::numeric_limits< ResultType >::is_specialized,
-                  "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_and<>{}, std::numeric_limits< ResultType >::max() );
-}
-
-template< typename Expression >
-auto ExpressionBinaryOr( const Expression& expression )
--> RemoveET< std::decay_t< decltype( expression[0] | expression[0] ) > >
-{
-   using ResultType = RemoveET< std::decay_t< decltype( expression[0] | expression[0] ) > >;
-   using IndexType = typename Expression::IndexType;
-
-   const auto view = expression.getConstView();
-   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_or<>{}, (ResultType) 0 );
-}
-
-} // namespace Expressions
-} // namespace Containers
-} // namespace TNL
diff --git a/src/TNL/Containers/StaticArray.h b/src/TNL/Containers/StaticArray.h
index 4f7f753c223fb24c75f0c08a011848d98927c451..ba5f65ed27cbcf3cbee6febfc72060fae601c017 100644
--- a/src/TNL/Containers/StaticArray.h
+++ b/src/TNL/Containers/StaticArray.h
@@ -135,6 +135,22 @@ public:
    __cuda_callable__
    Value& operator[]( int i );
 
+   /**
+    * \brief Accesses specified element at the position \e i and returns a constant reference to its value.
+    *
+    * Equivalent to \ref operator[].
+    */
+   __cuda_callable__
+   const Value& operator()( int i ) const;
+
+   /**
+    * \brief Accesses specified element at the position \e i and returns a reference to its value.
+    *
+    * Equivalent to \ref operator[].
+    */
+   __cuda_callable__
+   Value& operator()( int i );
+
    /**
     * \brief Returns reference to the first coordinate.
     */
diff --git a/src/TNL/Containers/StaticArray.hpp b/src/TNL/Containers/StaticArray.hpp
index c6c18fb0b2a6bde7467e2606bc08439e15ff7b3e..507a84a633e9b1fbe0c0d81d382d242fee32ab2d 100644
--- a/src/TNL/Containers/StaticArray.hpp
+++ b/src/TNL/Containers/StaticArray.hpp
@@ -190,6 +190,21 @@ Value& StaticArray< Size, Value >::operator[]( int i )
    TNL_ASSERT_LT( i, Size, "Element index is out of bounds." );
    return data[ i ];
 }
+
+template< int Size, typename Value >
+__cuda_callable__
+const Value& StaticArray< Size, Value >::operator()( int i ) const
+{
+   return operator[]( i );
+}
+
+template< int Size, typename Value >
+__cuda_callable__
+Value& StaticArray< Size, Value >::operator()( int i )
+{
+   return operator[]( i );
+}
+
 template< int Size, typename Value >
 __cuda_callable__
 Value& StaticArray< Size, Value >::x()
@@ -280,11 +295,7 @@ StaticArray< Size, Value >::
 operator StaticArray< Size, OtherValue >() const
 {
    StaticArray< Size, OtherValue > aux;
-   Algorithms::unrolledFor< int, 0, Size >(
-      [&] ( int i ) mutable {
-         aux[ i ] = (*this)[ i ];
-      }
-   );
+   aux.operator=( *this );
    return aux;
 }
 
diff --git a/src/TNL/Containers/StaticVector.h b/src/TNL/Containers/StaticVector.h
index 13cdd0fbc8ba583c4bcab601f568a2712076226f..da310a1ca43e073c01f1cc96e45b2c07d899a8b8 100644
--- a/src/TNL/Containers/StaticVector.h
+++ b/src/TNL/Containers/StaticVector.h
@@ -70,7 +70,7 @@ public:
 
    /**
     * \brief Constructor from binary vector expression.
-    * 
+    *
     * \param expr is binary expression.
     */
    template< typename T1,
@@ -81,7 +81,7 @@ public:
 
    /**
     * \brief Constructor from unary expression.
-    * 
+    *
     * \param expr is unary expression
     */
    template< typename T,
@@ -100,9 +100,9 @@ public:
 
    /**
     * \brief Assignment operator with a vector expression.
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -112,9 +112,9 @@ public:
 
    /**
     * \brief Addition operator with a vector expression
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -124,9 +124,9 @@ public:
 
    /**
     * \brief Subtraction operator with a vector expression.
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -138,7 +138,7 @@ public:
     * \brief Elementwise multiplication by a vector expression.
     *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression.
     * \return reference to this vector
     */
@@ -148,9 +148,9 @@ public:
 
    /**
     * \brief Elementwise division by a vector expression.
-    * 
+    *
     * The vector expression can be even just static vector.
-    * 
+    *
     * \param expression is the vector expression
     * \return reference to this vector
     */
@@ -158,15 +158,27 @@ public:
    __cuda_callable__
    StaticVector& operator/=( const VectorExpression& expression );
 
+   /**
+    * \brief Elementwise modulo by a vector expression.
+    *
+    * The vector expression can be even just static vector.
+    *
+    * \param expression is the vector expression
+    * \return reference to this vector
+    */
+   template< typename VectorExpression >
+   __cuda_callable__
+   StaticVector& operator%=( const VectorExpression& expression );
+
    /**
     * \brief Cast operator for changing of the \e Value type.
-    * 
+    *
     * Returns static array having \e ValueType set to \e OtherValue, i.e.
     * StaticArray< Size, OtherValue >.
-    * 
-    * \tparam OtherValue is the \e Value type of the static array the casting 
+    *
+    * \tparam OtherValue is the \e Value type of the static array the casting
     * will be performed to.
-    * 
+    *
     * \return instance of StaticVector< Size, OtherValue >
     */
    template< typename OtherReal >
diff --git a/src/TNL/Containers/StaticVector.hpp b/src/TNL/Containers/StaticVector.hpp
index bb22eba8c03a4d352638fd0005524586bd947f2f..b995dc11a8130c96cf5ac16607b73a28e9ff680a 100644
--- a/src/TNL/Containers/StaticVector.hpp
+++ b/src/TNL/Containers/StaticVector.hpp
@@ -92,6 +92,15 @@ StaticVector< Size, Real >& StaticVector< Size, Real >::operator/=( const Vector
    return *this;
 }
 
+template< int Size, typename Real >
+   template< typename VectorExpression >
+__cuda_callable__
+StaticVector< Size, Real >& StaticVector< Size, Real >::operator%=( const VectorExpression& expression )
+{
+   detail::VectorAssignmentWithOperation< StaticVector, VectorExpression >::moduloStatic( *this, expression );
+   return *this;
+}
+
 template< int Size, typename Real >
    template< typename OtherReal >
 __cuda_callable__
@@ -99,11 +108,7 @@ StaticVector< Size, Real >::
 operator StaticVector< Size, OtherReal >() const
 {
    StaticVector< Size, OtherReal > aux;
-   Algorithms::unrolledFor< int, 0, Size >(
-      [&] ( int i ) mutable {
-         aux[ i ] = (*this)[ i ];
-      }
-   );
+   aux.operator=( *this );
    return aux;
 }
 
diff --git a/src/TNL/Containers/Vector.h b/src/TNL/Containers/Vector.h
index 859e326d483441019edaa1649e69649a31bacbfe..b6708563a5141efd958afdf91825b5a454866c58 100644
--- a/src/TNL/Containers/Vector.h
+++ b/src/TNL/Containers/Vector.h
@@ -258,84 +258,16 @@ public:
    Vector& operator/=( const VectorExpression& expression );
 
    /**
-    * \brief Computes the scan (prefix sum) of the vector elements.
+    * \brief Modulo assignment operator for vector and a vector expression.
     *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of the vector elements.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename FlagsArray >
-   void segmentedScan( FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the scan (prefix sum) of the vector expression.
-    *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression >
-   void scan( const VectorExpression& expression, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of a vector expression.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
+    * The modulo is evaluated element-wise. The vector expression must
+    * either evaluate to a scalar or a vector of the same size as this vector.
     *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
+    * \param expression Reference to a vector expression.
+    * \return Reference to this vector.
     */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression,
-             typename FlagsArray >
-   void segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
+   template< typename VectorExpression >
+   Vector& operator%=( const VectorExpression& expression );
 };
 
 // Enable expression templates for Vector
diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp
index b25ccbb5ac5ef117d17c024de38f5549d77f0b9e..6ab91cd3e7fa74344847c0c23d4c330f05852d05 100644
--- a/src/TNL/Containers/Vector.hpp
+++ b/src/TNL/Containers/Vector.hpp
@@ -11,7 +11,6 @@
 #pragma once
 
 #include <TNL/Containers/Vector.h>
-#include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
 namespace Containers {
@@ -157,56 +156,13 @@ template< typename Real,
           typename Device,
           typename Index,
           typename Allocator >
-   template< Algorithms::ScanType Type >
-void
-Vector< Real, Device, Index, Allocator >::
-scan( IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::Scan< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type,
-             typename FlagsArray >
-void
-Vector< Real, Device, Index, Allocator >::
-segmentedScan( FlagsArray& flags, IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::SegmentedScan< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression >
-void
-Vector< Real, Device, Index, Allocator >::
-scan( const VectorExpression& expression, IndexType begin, IndexType end )
-{
-   throw Exceptions::NotImplementedError( "Scan (prefix sum) with vector expressions is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index,
-          typename Allocator >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression,
-             typename FlagsArray >
-void
+   template< typename VectorExpression >
+Vector< Real, Device, Index, Allocator >&
 Vector< Real, Device, Index, Allocator >::
-segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin, IndexType end )
+operator%=( const VectorExpression& expression )
 {
-   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) with vector expressions is not implemented." );
+   detail::VectorAssignmentWithOperation< Vector, VectorExpression >::modulo( *this, expression );
+   return *this;
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h
index 2416b85095597398a5d25228e8ac6ee50bf37fb2..04d23b0659f5fdf15221750d269a5734871a4cf8 100644
--- a/src/TNL/Containers/VectorView.h
+++ b/src/TNL/Containers/VectorView.h
@@ -14,7 +14,6 @@
 
 #include <TNL/Containers/ArrayView.h>
 #include <TNL/Containers/Expressions/ExpressionTemplates.h>
-#include <TNL/Algorithms/Scan.h>
 
 namespace TNL {
 namespace Containers {
@@ -216,84 +215,17 @@ public:
    VectorView& operator/=( const VectorExpression& expression );
 
    /**
-    * \brief Computes the scan (prefix sum) of the vector elements.
+    * \brief Modulo assignment operator for vector view and a vector expression.
     *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive >
-   void scan( IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of the vector elements.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename FlagsArray >
-   void segmentedScan( FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the scan (prefix sum) of the vector expression.
-    *
-    * By default, scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
-    *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
-    */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression >
-   void scan( const VectorExpression& expression, IndexType begin = 0, IndexType end = 0 );
-
-   /**
-    * \brief Computes the segmented scan (prefix sum) of a vector expression.
-    *
-    * By default, segmented scan is computed for the whole vector. If \e begin
-    * or \e end is set to a non-zero value, only elements in the sub-interval
-    * `[begin, end)` are scanned.
-    *
-    * \tparam Type The scan type - either \e Inclusive or \e Exclusive.
+    * The modulo is evaluated element-wise. The vector expression must
+    * either evaluate to a scalar or a vector of the same size as this vector
+    * view.
     *
-    * \param expression A vector expression for which scan is computed and
-    *                   stored in this vector.
-    * \param flags A binary array where ones indicate the beginning of each
-    *              segment.
-    * \param begin The beginning of the vector sub-interval. It is 0 by
-    *              default.
-    * \param end The end of the vector sub-interval. The default value is 0
-    *            which is, however, replaced with the array size.
+    * \param expression Reference to a vector expression.
+    * \return Reference to this vector.
     */
-   template< Algorithms::ScanType Type = Algorithms::ScanType::Inclusive,
-             typename VectorExpression,
-             typename FlagsArray >
-   void segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin = 0, IndexType end = 0 );
+   template< typename VectorExpression >
+   VectorView& operator%=( const VectorExpression& expression );
 };
 
 // Enable expression templates for VectorView
diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp
index 2c1cd02c8163db83760907a50aeafaf0c8e5404d..0d7d13b652895095a5cbbd5a88e26be26cbb9d59 100644
--- a/src/TNL/Containers/VectorView.hpp
+++ b/src/TNL/Containers/VectorView.hpp
@@ -12,7 +12,6 @@
 
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Containers/detail/VectorAssignment.h>
-#include <TNL/Exceptions/NotImplementedError.h>
 
 namespace TNL {
 namespace Containers {
@@ -105,53 +104,13 @@ operator/=( const VectorExpression& expression )
 template< typename Real,
           typename Device,
           typename Index >
-   template< Algorithms::ScanType Type >
-void
-VectorView< Real, Device, Index >::
-scan( IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::Scan< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type,
-             typename FlagsArray >
-void
-VectorView< Real, Device, Index >::
-segmentedScan( FlagsArray& flags, IndexType begin, IndexType end )
-{
-   if( end == 0 )
-      end = this->getSize();
-   Algorithms::SegmentedScan< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression >
-void
-VectorView< Real, Device, Index >::
-scan( const VectorExpression& expression, IndexType begin, IndexType end )
-{
-   throw Exceptions::NotImplementedError( "Scan (prefix sum) with vector expressions is not implemented." );
-}
-
-template< typename Real,
-          typename Device,
-          typename Index >
-   template< Algorithms::ScanType Type,
-             typename VectorExpression,
-             typename FlagsArray >
-void
+   template< typename VectorExpression >
+VectorView< Real, Device, Index >&
 VectorView< Real, Device, Index >::
-segmentedScan( const VectorExpression& expression, FlagsArray& flags, IndexType begin, IndexType end )
+operator%=( const VectorExpression& expression )
 {
-   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) with vector expressions is not implemented." );
+   detail::VectorAssignmentWithOperation< VectorView, VectorExpression >::modulo( *this, expression );
+   return *this;
 }
 
 } // namespace Containers
diff --git a/src/TNL/Containers/detail/VectorAssignment.h b/src/TNL/Containers/detail/VectorAssignment.h
index 5a36d971ce9d93d5500f214ec1dda507cf6e3a2b..5c3815a9c0e0b0afa2bdc0184f9289b8f0585b4a 100644
--- a/src/TNL/Containers/detail/VectorAssignment.h
+++ b/src/TNL/Containers/detail/VectorAssignment.h
@@ -27,7 +27,7 @@ template< typename Vector,
 struct VectorAssignment;
 
 /**
- * \brief Vector assignment with an operation: +=, -=, *=, /=
+ * \brief Vector assignment with an operation: +=, -=, *=, /=, %=
  */
 template< typename Vector,
           typename T,
@@ -87,7 +87,6 @@ struct VectorAssignment< Vector, T, false >
    __cuda_callable__
    static void assignStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] = t;
    }
@@ -246,6 +245,31 @@ struct VectorAssignmentWithOperation< Vector, T, true, false >
       };
       Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), divide );
    }
+
+   __cuda_callable__
+   static void moduloStatic( Vector& v, const T& t )
+   {
+      TNL_ASSERT_EQ( v.getSize(), t.getSize(), "The sizes of the vectors must be equal." );
+      for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
+         v[ i ] %= t[ i ];
+   }
+
+   static void modulo( Vector& v, const T& t )
+   {
+      static_assert( std::is_same< typename Vector::DeviceType, typename T::DeviceType >::value,
+                     "Cannot assign an expression to a vector allocated on a different device." );
+      TNL_ASSERT_EQ( v.getSize(), t.getSize(), "The sizes of the vectors must be equal." );
+      using RealType = typename Vector::RealType;
+      using DeviceType = typename Vector::DeviceType;
+      using IndexType = typename Vector::IndexType;
+
+      RealType* data = v.getData();
+      auto moduloOp = [=] __cuda_callable__ ( IndexType i )
+      {
+         data[ i ] %= t[ i ];
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), moduloOp );
+   }
 };
 
 /**
@@ -259,7 +283,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void additionStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] += t;
    }
@@ -281,7 +304,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void subtractionStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] -= t;
    }
@@ -303,7 +325,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void multiplicationStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] *= t;
    }
@@ -325,7 +346,6 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
    __cuda_callable__
    static void divisionStatic( Vector& v, const T& t )
    {
-      TNL_ASSERT_GT( v.getSize(), 0, "Cannot assign value to empty vector." );
       for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
          v[ i ] /= t;
    }
@@ -343,6 +363,27 @@ struct VectorAssignmentWithOperation< Vector, T, false, false >
       };
       Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), divide );
    }
+
+   __cuda_callable__
+   static void moduloStatic( Vector& v, const T& t )
+   {
+      for( decltype( v.getSize() ) i = 0; i < v.getSize(); i++ )
+         v[ i ] %= t;
+   }
+
+   static void modulo( Vector& v, const T& t )
+   {
+      using RealType = typename Vector::RealType;
+      using DeviceType = typename Vector::DeviceType;
+      using IndexType = typename Vector::IndexType;
+
+      RealType* data = v.getData();
+      auto moduloOp = [=] __cuda_callable__ ( IndexType i )
+      {
+         data[ i ] %= t;
+      };
+      Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, v.getSize(), moduloOp );
+   }
 };
 
 } // namespace detail
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index da4aa93fcc57a19ab68ed46ff06ee5fdf042a190..e39276484506e2fe4d280a43c0d940c65e5a3168 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -18,33 +18,170 @@
 namespace TNL {
 
 /**
- * \brief Extension of \ref std::plus<void> for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `x + y`.
  */
 struct Plus : public std::plus< void >
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
-   static constexpr T getIdempotent() { return 0; };
+   static constexpr T getIdentity() { return 0; }
 };
 
 /**
- * \brief Extension of \ref std::multiplies<void> for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `x - y`.
+ */
+using Minus = std::minus< void >;
+
+/**
+ * \brief Function object implementing `x * y`.
  */
 struct Multiplies : public std::multiplies< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return 1; }
+};
+
+/**
+ * \brief Function object implementing `x / y`.
+ */
+using Divides = std::divides< void >;
+
+/**
+ * \brief Function object implementing `x % y`.
+ */
+using Modulus = std::modulus< void >;
+
+/**
+ * \brief Function object implementing `+x`.
+ */
+struct UnaryPlus
 {
    template< typename T >
-   static constexpr T getIdempotent() { return 1; };
+   constexpr auto operator()( const T& x ) const -> decltype( +x )
+   {
+      return +x;
+   }
 };
 
 /**
- * \brief Function object implementing `min(x, y)` for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `-x`.
+ */
+using UnaryMinus = std::negate< void >;
+
+/**
+ * \brief Function object implementing `x && y`.
+ */
+struct LogicalAnd : public std::logical_and< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::max();
+   }
+};
+
+/**
+ * \brief Function object implementing `x || y`.
+ */
+struct LogicalOr : public std::logical_or< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return 0; }
+};
+
+/**
+ * \brief Function object implementing `!x`.
+ */
+using LogicalNot = std::logical_not< void >;
+
+/**
+ * \brief Extension of \ref std::bit_and<void> for use with \ref TNL::Algorithms::reduce.
+ */
+struct BitAnd : public std::bit_and< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return ~static_cast< T >( 0 ); }
+};
+
+/**
+ * \brief Extension of \ref std::bit_or<void> for use with \ref TNL::Algorithms::reduce.
+ */
+struct BitOr : public std::bit_or< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return 0; }
+};
+
+/**
+ * \brief Extension of \ref std::bit_xor<void> for use with \ref TNL::Algorithms::reduce.
+ */
+struct BitXor : public std::bit_xor< void >
+{
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
+   template< typename T >
+   static constexpr T getIdentity() { return 0; }
+};
+
+/**
+ * \brief Function object implementing `~x`.
+ */
+using BitNot = std::bit_not< void >;
+
+/**
+ * \brief Function object implementing `min(x, y)`.
  */
 struct Min
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
+   static constexpr T getIdentity()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::max();
+   }
 
-   template< typename Value >
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const
+   template< typename T1, typename T2 >
+   constexpr auto operator()( const T1& lhs, const T2& rhs ) const
    {
       // use argument-dependent lookup and make TNL::min available for unqualified calls
       using TNL::min;
@@ -53,15 +190,25 @@ struct Min
 };
 
 /**
- * \brief Function object implementing `max(x, y)` for use with \ref TNL::Algorithms::reduce.
+ * \brief Function object implementing `max(x, y)`.
  */
 struct Max
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduce.
+    */
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
+   static constexpr T getIdentity()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::lowest();
+   }
 
-   template< typename Value >
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const
+   template< typename T1, typename T2 >
+   constexpr auto operator()( const T1& lhs, const T2& rhs ) const
    {
       // use argument-dependent lookup and make TNL::max available for unqualified calls
       using TNL::max;
@@ -70,12 +217,22 @@ struct Max
 };
 
 /**
- * \brief Extension of \ref std::min<void> for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Function object implementing `argmin(x, y, i, j)` for use with \ref TNL::Algorithms::reduceWithArgument.
  */
 struct MinWithArg
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduceWithArgument.
+    */
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
+   static constexpr T getIdentity()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::max();
+   }
 
    template< typename Value, typename Index >
    constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
@@ -93,12 +250,22 @@ struct MinWithArg
 };
 
 /**
- * \brief Extension of \ref std::max<void> for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Function object implementing `argmax(x, y, i, j)` for use with \ref TNL::Algorithms::reduceWithArgument.
  */
 struct MaxWithArg
 {
+   /**
+    * \brief Returns the [identity element](https://en.wikipedia.org/wiki/Identity_element) of the operation.
+    *
+    * Suitable for \ref TNL::Algorithms::reduceWithArgument.
+    */
    template< typename T >
-   static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
+   static constexpr T getIdentity()
+   {
+      static_assert( std::numeric_limits< T >::is_specialized,
+                     "std::numeric_limits is not specialized for the requested type" );
+      return std::numeric_limits< T >::lowest();
+   }
 
    template< typename Value, typename Index >
    constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
@@ -115,40 +282,68 @@ struct MaxWithArg
    }
 };
 
-/**
- * \brief Extension of \ref std::logical_and<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct LogicalAnd : public std::logical_and< void >
-{
-   template< typename T >
-   static constexpr T getIdempotent() { return true; };
-};
+#define TNL_MAKE_UNARY_FUNCTIONAL(name, function)                       \
+   struct name                                                          \
+   {                                                                    \
+      template< typename T >                                            \
+      __cuda_callable__                                                 \
+      auto operator()( const T& x ) const -> decltype( function( x ) )  \
+      {                                                                 \
+         return function( x );                                          \
+      }                                                                 \
+   };                                                                   \
 
-/**
- * \brief Extension of \ref std::logical_or<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct LogicalOr : public std::logical_or< void >
-{
-   template< typename T >
-   static constexpr T getIdempotent() { return false; };
-};
+#define TNL_MAKE_BINARY_FUNCTIONAL(name, function)                                       \
+   struct name                                                                           \
+   {                                                                                     \
+      template< typename T1, typename T2 >                                               \
+      __cuda_callable__                                                                  \
+      auto operator()( const T1& x, const T2& y ) const -> decltype( function( x, y ) )  \
+      {                                                                                  \
+         return function( x, y );                                                        \
+      }                                                                                  \
+   };                                                                                    \
 
-/**
- * \brief Extension of \ref std::bit_and<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct BitAnd : public std::bit_and< void >
-{
-   template< typename T >
-   static constexpr T getIdempotent() { return ~static_cast< T >( 0 ); };
-};
+TNL_MAKE_UNARY_FUNCTIONAL( Abs, abs )
+TNL_MAKE_UNARY_FUNCTIONAL( Exp, exp )
+TNL_MAKE_UNARY_FUNCTIONAL( Sqrt, sqrt )
+TNL_MAKE_UNARY_FUNCTIONAL( Cbrt, cbrt )
+TNL_MAKE_UNARY_FUNCTIONAL( Log, log )
+TNL_MAKE_UNARY_FUNCTIONAL( Log10, log10 )
+TNL_MAKE_UNARY_FUNCTIONAL( Log2, log2 )
+TNL_MAKE_UNARY_FUNCTIONAL( Sin, sin )
+TNL_MAKE_UNARY_FUNCTIONAL( Cos, cos )
+TNL_MAKE_UNARY_FUNCTIONAL( Tan, tan )
+TNL_MAKE_UNARY_FUNCTIONAL( Asin, asin )
+TNL_MAKE_UNARY_FUNCTIONAL( Acos, acos )
+TNL_MAKE_UNARY_FUNCTIONAL( Atan, atan )
+TNL_MAKE_UNARY_FUNCTIONAL( Sinh, sinh )
+TNL_MAKE_UNARY_FUNCTIONAL( Cosh, cosh )
+TNL_MAKE_UNARY_FUNCTIONAL( Tanh, tanh )
+TNL_MAKE_UNARY_FUNCTIONAL( Asinh, asinh )
+TNL_MAKE_UNARY_FUNCTIONAL( Acosh, acosh )
+TNL_MAKE_UNARY_FUNCTIONAL( Atanh, atanh )
+TNL_MAKE_UNARY_FUNCTIONAL( Floor, floor )
+TNL_MAKE_UNARY_FUNCTIONAL( Ceil, ceil )
+TNL_MAKE_UNARY_FUNCTIONAL( Sign, sign )
 
-/**
- * \brief Extension of \ref std::bit_or<void> for use with \ref TNL::Algorithms::reduce.
- */
-struct BitOr : public std::bit_or< void >
+TNL_MAKE_BINARY_FUNCTIONAL( Pow, pow )
+
+#undef TNL_MAKE_UNARY_FUNCTIONAL
+#undef TNL_MAKE_BINARY_FUNCTIONAL
+
+template< typename ResultType >
+struct Cast
 {
-   template< typename T >
-   static constexpr T getIdempotent() { return 0; };
+   struct Operation
+   {
+      template< typename T >
+      __cuda_callable__
+      auto operator()( const T& a ) const -> ResultType
+      {
+         return static_cast<ResultType>( a );
+      }
+   };
 };
 
 } // namespace TNL
diff --git a/src/TNL/Math.h b/src/TNL/Math.h
index cb583c03c14eb2c388b59ec19e0a84fd4ccb958c..220f6ad547d9fdb2b520b76393881565a04c8b5a 100644
--- a/src/TNL/Math.h
+++ b/src/TNL/Math.h
@@ -27,20 +27,11 @@ namespace TNL {
  */
 template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type,
           // enable_if is necessary to avoid ambiguity in vector expressions
-          std::enable_if_t< ! HasSubscriptOperator<T1>::value && ! HasSubscriptOperator<T2>::value, bool > = true >
-__cuda_callable__
-ResultType min( const T1& a, const T2& b )
+          std::enable_if_t< std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value, bool > = true >
+constexpr ResultType min( const T1& a, const T2& b )
 {
-#if __cplusplus >= 201402L
    // std::min is constexpr since C++14 so it can be reused directly
    return std::min( (ResultType) a, (ResultType) b );
-#else
- #if defined(__CUDA_ARCH__)
-   return ::min( (ResultType) a, (ResultType) b );
- #else
-   return std::min( (ResultType) a, (ResultType) b );
- #endif
-#endif
 }
 
 /**
@@ -49,8 +40,7 @@ ResultType min( const T1& a, const T2& b )
  * The inputs are folded with the \ref min function from the left to the right.
  */
 template< typename T1, typename T2, typename T3, typename... Ts >
-__cuda_callable__
-typename std::common_type< T1, T2, T3, Ts... >::type
+constexpr typename std::common_type< T1, T2, T3, Ts... >::type
 min( T1&& val1, T2&& val2, T3&& val3, Ts&&... vs )
 {
    return min( min( std::forward<T1>(val1), std::forward<T2>(val2) ),
@@ -65,20 +55,11 @@ min( T1&& val1, T2&& val2, T3&& val3, Ts&&... vs )
  */
 template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type,
           // enable_if is necessary to avoid ambiguity in vector expressions
-          std::enable_if_t< ! HasSubscriptOperator<T1>::value && ! HasSubscriptOperator<T2>::value, bool > = true >
-__cuda_callable__
-ResultType max( const T1& a, const T2& b )
+          std::enable_if_t< std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value, bool > = true >
+constexpr ResultType max( const T1& a, const T2& b )
 {
-#if __cplusplus >= 201402L
    // std::max is constexpr since C++14 so it can be reused directly
    return std::max( (ResultType) a, (ResultType) b );
-#else
- #if defined(__CUDA_ARCH__)
-   return ::max( (ResultType) a, (ResultType) b );
- #else
-   return std::max( (ResultType) a, (ResultType) b );
- #endif
-#endif
 }
 
 /**
@@ -99,7 +80,7 @@ max( T1&& val1, T2&& val2, T3&& val3, Ts&&... vs )
  * \brief This function returns absolute value of given number \e n.
  */
 template< class T,
-          std::enable_if_t< ! std::is_unsigned<T>::value && ! std::is_class<T>::value, bool > = true >
+          std::enable_if_t< std::is_arithmetic<T>::value && ! std::is_unsigned<T>::value, bool > = true >
 __cuda_callable__
 T abs( const T& n )
 {
@@ -169,7 +150,7 @@ ResultType argAbsMax( const T1& a, const T2& b )
  */
 template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type,
           // enable_if is necessary to avoid ambiguity in vector expressions
-          std::enable_if_t< ! std::is_class<T1>::value && ! std::is_class<T2>::value, bool > = true >
+          std::enable_if_t< std::is_arithmetic<T1>::value && std::is_arithmetic<T2>::value, bool > = true >
 __cuda_callable__
 ResultType pow( const T1& base, const T2& exp )
 {
diff --git a/src/TNL/Matrices/DenseMatrix.h b/src/TNL/Matrices/DenseMatrix.h
index 8d6e5d771abbbf9f558158082782a5ad8a5f2a86..a65c12d80bcb0475198714b5adf597d45e7a926c 100644
--- a/src/TNL/Matrices/DenseMatrix.h
+++ b/src/TNL/Matrices/DenseMatrix.h
@@ -701,7 +701,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceRows.cpp
@@ -709,7 +711,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -728,7 +730,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceRows.cpp
@@ -736,7 +740,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
@@ -753,7 +757,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceAllRows.cpp
@@ -761,7 +767,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
@@ -778,7 +784,9 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixExample_reduceAllRows.cpp
@@ -786,7 +794,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include DenseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Computes product of matrix and vector.
@@ -964,7 +972,7 @@ class DenseMatrix : public Matrix< Real, Device, Index, RealAllocator >
 
 /**
  * \brief Insertion operator for dense matrix and output stream.
- * 
+ *
  * \param str is the output stream.
  * \param matrix is the dense matrix.
  * \return  reference to the stream.
diff --git a/src/TNL/Matrices/DenseMatrix.hpp b/src/TNL/Matrices/DenseMatrix.hpp
index d330b833360b67d634a523a2418a9a5f060af8be..e3a9751673e72f77ffda8021728454f078d8fb52 100644
--- a/src/TNL/Matrices/DenseMatrix.hpp
+++ b/src/TNL/Matrices/DenseMatrix.hpp
@@ -355,9 +355,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -368,9 +368,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -381,9 +381,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -394,9 +394,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -1375,7 +1375,7 @@ template< typename Real,
           ElementsOrganization Organization,
           typename RealAllocator >
 std::ostream& operator<< ( std::ostream& str, const DenseMatrix< Real, Device, Index, Organization, RealAllocator >& matrix )
-{ 
+{
    matrix.print( str );
    return str;
 }
diff --git a/src/TNL/Matrices/DenseMatrixView.h b/src/TNL/Matrices/DenseMatrixView.h
index 89ace2d0697a086de8311a677c40ae97e1759ea3..ea7f6dbe74d89fee10596363fccb710089486206 100644
--- a/src/TNL/Matrices/DenseMatrixView.h
+++ b/src/TNL/Matrices/DenseMatrixView.h
@@ -418,7 +418,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceRows.cpp
@@ -426,7 +428,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -445,7 +447,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceRows.cpp
@@ -453,7 +457,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
@@ -470,7 +474,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceAllRows.cpp
@@ -478,7 +484,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows for constant instances.
@@ -495,7 +501,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/DenseMatrix/DenseMatrixViewExample_reduceAllRows.cpp
@@ -503,7 +511,7 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
        * \include DenseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 4a999d76b07ce9f4b1659609edafbc9683cca336..6ad36f27afd64b5863f65e90939a7dd89671833d 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -290,14 +290,14 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
    auto values_view = this->values.getView();
    auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -307,14 +307,14 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
    const auto values_view = this->values.getConstView();
    auto fetch_ = [=] __cuda_callable__ ( IndexType rowIdx, IndexType columnIdx, IndexType globalIdx, bool& compute ) mutable -> decltype( fetch( IndexType(), IndexType(), RealType() ) ) {
          return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -324,9 +324,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -336,9 +336,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 DenseMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/LambdaMatrix.h b/src/TNL/Matrices/LambdaMatrix.h
index 511942f0195b1f8b0617d9dde6b5429c34cc4b25..01d3a0b9101bb5e95e0c044f4c39d70b750bd95e 100644
--- a/src/TNL/Matrices/LambdaMatrix.h
+++ b/src/TNL/Matrices/LambdaMatrix.h
@@ -388,7 +388,9 @@ class LambdaMatrix
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/LambdaMatrix/LambdaMatrixExample_reduceRows.cpp
@@ -396,7 +398,7 @@ class LambdaMatrix
        * \include LambdaMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on ALL matrix rows.
@@ -413,7 +415,9 @@ class LambdaMatrix
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/LambdaMatrix/LambdaMatrixExample_reduceAllRows.cpp
@@ -421,7 +425,7 @@ class LambdaMatrix
        * \include LambdaMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Computes product of matrix and vector.
diff --git a/src/TNL/Matrices/LambdaMatrix.hpp b/src/TNL/Matrices/LambdaMatrix.hpp
index f2cdb75749f828d57b91fe5219d236c96162e698..20d2ccbb10b7cf734eb9d7f1b285452d9fa5c416 100644
--- a/src/TNL/Matrices/LambdaMatrix.hpp
+++ b/src/TNL/Matrices/LambdaMatrix.hpp
@@ -262,7 +262,7 @@ template< typename MatrixElementsLambda,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
    using FetchType = decltype( fetch( IndexType(), IndexType(), RealType() ) );
 
@@ -272,13 +272,13 @@ reduceRows( IndexType first, IndexType last, Fetch& fetch, const Reduce& reduce,
    auto matrixElements = this->matrixElementsLambda;
    auto processRow = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
       const IndexType rowLength = rowLengths( rows, columns, rowIdx );
-      FetchType result( zero );
+      FetchType result = identity;
       for( IndexType localIdx = 0; localIdx < rowLength; localIdx++ )
       {
         IndexType elementColumn( 0 );
         RealType elementValue( 0.0 );
         matrixElements( rows, columns, rowIdx, localIdx, elementColumn, elementValue );
-        FetchType fetchValue( zero );
+        FetchType fetchValue = identity;
         if( elementValue != 0.0 )
             fetchValue = fetch( rowIdx, elementColumn, elementValue );
         result = reduce( result, fetchValue );
@@ -296,9 +296,9 @@ template< typename MatrixElementsLambda,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 LambdaMatrix< MatrixElementsLambda, CompressedRowLengthsLambda, Real, Device, Index >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename MatrixElementsLambda,
@@ -444,7 +444,7 @@ print( std::ostream& str ) const
 
 /**
  * \brief Insertion operator for dense matrix and output stream.
- * 
+ *
  * \param str is the output stream.
  * \param matrix is the lambda matrix.
  * \return reference to the stream.
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.h b/src/TNL/Matrices/MultidiagonalMatrix.h
index e29796a1e12a544334a5a786e0896ef7fe4a83fc..d938a106298c298b090ad5bdaf67b57768e109a4 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.h
+++ b/src/TNL/Matrices/MultidiagonalMatrix.h
@@ -47,7 +47,7 @@ namespace Matrices {
  * are \f$\{-3,-1,0,1,3\}\f$. Advantage is that we do not store the column indexes
  * explicitly as it is in \ref SparseMatrix. This can reduce significantly the
  * memory requirements which also means better performance. See the following table
- * for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix 
+ * for the storage requirements comparison between \ref TNL::Matrices::MultidiagonalMatrix
  * and \ref TNL::Matrices::SparseMatrix.
  *
  *  Real   | Index     |      SparseMatrix    | MultidiagonalMatrix | Ratio
@@ -614,7 +614,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceRows.cpp
@@ -622,7 +624,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -641,7 +643,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceRows.cpp
@@ -649,7 +653,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -666,7 +670,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceAllRows.cpp
@@ -674,7 +680,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -691,7 +697,9 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_reduceAllRows.cpp
@@ -699,7 +707,7 @@ class MultidiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include MultidiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over matrix rows for constant instances.
diff --git a/src/TNL/Matrices/MultidiagonalMatrix.hpp b/src/TNL/Matrices/MultidiagonalMatrix.hpp
index 7e6ac450f54fc50e772e51a18bd2a8fb98e406e0..99534b92b22e3348c73d90816136c4c4e65e945e 100644
--- a/src/TNL/Matrices/MultidiagonalMatrix.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrix.hpp
@@ -477,9 +477,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -491,9 +491,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -505,9 +505,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -519,9 +519,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrix< Real, Device, Index, Organization, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.h b/src/TNL/Matrices/MultidiagonalMatrixView.h
index bc3de664b9142bd9ad1648a8b3bfa0e5fb7bd94b..3575602136cc1596dd461dab2e3ad302645dd535 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.h
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.h
@@ -376,7 +376,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceRows.cpp
@@ -384,7 +386,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
@@ -403,7 +405,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceRows.cpp
@@ -411,7 +415,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -428,7 +432,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceAllRows.cpp
@@ -436,7 +442,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -453,7 +459,9 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_reduceAllRows.cpp
@@ -461,7 +469,7 @@ class MultidiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include MultidiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.hpp b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
index 2b83fc87bc87b3ce6b1e18e137310b127f5dd4d5..03bc6907ef7f6725ceee27a17ef5cf8f8d48be24 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
@@ -356,7 +356,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    const auto values_view = this->values.getConstView();
@@ -364,9 +364,8 @@ reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep&
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
@@ -385,7 +384,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    const auto values_view = this->values.getConstView();
@@ -393,9 +392,8 @@ reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep&
    const IndexType diagonalsCount = this->diagonalsOffsets.getSize();
    const IndexType columns = this->getColumns();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       for( IndexType localIdx = 0; localIdx < diagonalsCount; localIdx++ )
       {
          const IndexType columnIdx = rowIdx + diagonalsOffsets_view[ localIdx ];
@@ -414,9 +412,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -426,9 +424,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 MultidiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrix.h b/src/TNL/Matrices/SparseMatrix.h
index b9b7dceae153a0b6e3572c75c0cb4950566becff..237417d66aa903b176b1e502a7f92cec99c81049 100644
--- a/src/TNL/Matrices/SparseMatrix.h
+++ b/src/TNL/Matrices/SparseMatrix.h
@@ -617,7 +617,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
@@ -625,7 +627,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -644,7 +646,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceRows.cpp
@@ -652,7 +656,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -669,7 +673,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
@@ -677,7 +683,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -694,7 +700,9 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixExample_reduceAllRows.cpp
@@ -702,7 +710,7 @@ class SparseMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include SparseMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for parallel iteration over matrix elements of given rows for constant instances.
diff --git a/src/TNL/Matrices/SparseMatrix.hpp b/src/TNL/Matrices/SparseMatrix.hpp
index 6f701a3ea7d3680091f3012fc0921e9ca85e6854..a183b38c1cbb67e1390ba0a190ba4ea59d9358d4 100644
--- a/src/TNL/Matrices/SparseMatrix.hpp
+++ b/src/TNL/Matrices/SparseMatrix.hpp
@@ -10,9 +10,7 @@
 
 #pragma once
 
-#include <functional>
 #include <sstream>
-#include <TNL/Algorithms/Reduction.h>
 #include <TNL/Matrices/SparseMatrix.h>
 
 namespace TNL {
@@ -539,9 +537,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -555,9 +553,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
-   this->view.reduceRows( begin, end, fetch, reduce, keep, zero );
+   this->view.reduceRows( begin, end, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -571,9 +569,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -587,9 +585,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrix< Real, Device, Index, MatrixType, Segments, ComputeReal, RealAllocator, IndexAllocator >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.h b/src/TNL/Matrices/SparseMatrixView.h
index 8651ad1c3993bb8bf4abbe8afc7d9d1fc1ee9948..40a89b628a4d0474f811dfb16ae62d05407d5302 100644
--- a/src/TNL/Matrices/SparseMatrixView.h
+++ b/src/TNL/Matrices/SparseMatrixView.h
@@ -408,7 +408,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
@@ -416,7 +418,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows for constant instances.
@@ -435,7 +437,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceRows.cpp
@@ -443,7 +447,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -460,7 +464,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
@@ -468,7 +474,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -485,7 +491,9 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/SparseMatrix/SparseMatrixViewExample_reduceAllRows.cpp
@@ -493,7 +501,7 @@ class SparseMatrixView : public MatrixView< Real, Device, Index >
        * \include SparseMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 02ef757c2831f5b95b29ab54900962d51ab14d7c..cf5e9771a5394a16e38d8897415f5433586fa7b4 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -12,7 +12,7 @@
 
 #include <functional>
 #include <TNL/Matrices/SparseMatrixView.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Algorithms/AtomicOperations.h>
 #include <TNL/Matrices/details/SparseMatrix.h>
 
@@ -504,7 +504,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero )
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity )
 {
    auto columns_view = this->columnIndexes.getView();
    auto values_view = this->values.getView();
@@ -518,9 +518,9 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          else
             return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       }
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -532,7 +532,7 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchValue >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& zero ) const
+reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchValue& identity ) const
 {
    const auto columns_view = this->columnIndexes.getConstView();
    const auto values_view = this->values.getConstView();
@@ -547,9 +547,9 @@ reduceRows( IndexType begin, IndexType end, Fetch& fetch, const Reduce& reduce,
          else
             return fetch( rowIdx, columnIdx, values_view[ globalIdx ] );
       }
-      return zero;
+      return identity;
    };
-   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, zero );
+   this->segments.segmentsReduction( begin, end, fetch_, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -561,9 +561,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -575,9 +575,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 SparseMatrixView< Real, Device, Index, MatrixType, SegmentsView, ComputeReal >::
-reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, const Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/TridiagonalMatrix.h b/src/TNL/Matrices/TridiagonalMatrix.h
index c970ff9b7f837c14bfe10e8d57d2dee54ac4b721..b74e0dcb9d8e3d609c593a40f3ac5986f3dd8ea5 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.h
+++ b/src/TNL/Matrices/TridiagonalMatrix.h
@@ -506,7 +506,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceRows.cpp
@@ -514,7 +516,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on matrix rows of constant matrix instances.
@@ -533,7 +535,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceRows.cpp
@@ -541,7 +545,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType begin, IndexType end, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -560,7 +564,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceAllRows.cpp
@@ -568,7 +574,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows of constant matrix instances.
@@ -587,7 +593,9 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixExample_reduceAllRows.cpp
@@ -595,7 +603,7 @@ class TridiagonalMatrix : public Matrix< Real, Device, Index, RealAllocator >
        * \include TridiagonalMatrixExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for iteration over matrix rows for constant instances.
diff --git a/src/TNL/Matrices/TridiagonalMatrix.hpp b/src/TNL/Matrices/TridiagonalMatrix.hpp
index 87a508a9cdb4de7cdbced0e4b3ea371594a55325..1841df5c84eb91dd805a657eb7862538a06bc566 100644
--- a/src/TNL/Matrices/TridiagonalMatrix.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrix.hpp
@@ -348,9 +348,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -361,9 +361,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( first, last, fetch, reduce, keep, zero );
+   this->view.reduceRows( first, last, fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -374,9 +374,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -387,9 +387,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrix< Real, Device, Index, Organization, RealAllocator >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, zero );
+   this->view.reduceRows( 0, this->getRows(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.h b/src/TNL/Matrices/TridiagonalMatrixView.h
index be2926934f46d794858763c31c045731a0b51e4c..e05a8b05971b2fed4492a58b169474aec6dcdf78 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.h
+++ b/src/TNL/Matrices/TridiagonalMatrixView.h
@@ -363,7 +363,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceRows.cpp
@@ -371,7 +373,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on matrix rows.
@@ -390,7 +392,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceRows.cpp
@@ -398,7 +402,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for performing general reduction on all matrix rows for constant instances.
@@ -415,7 +419,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceAllRows.cpp
@@ -423,7 +429,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const;
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const;
 
       /**
        * \brief Method for performing general reduction on all matrix rows.
@@ -440,7 +446,9 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \param fetch is an instance of lambda function for data fetch.
        * \param reduce is an instance of lambda function for reduction.
        * \param keep in an instance of lambda function for storing results.
-       * \param zero is zero of given reduction operation also known as idempotent element.
+       * \param identity is the [identity element](https://en.wikipedia.org/wiki/Identity_element)
+       *                 for the reduction operation, i.e. element which does not
+       *                 change the result of the reduction.
        *
        * \par Example
        * \include Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_reduceAllRows.cpp
@@ -448,7 +456,7 @@ class TridiagonalMatrixView : public MatrixView< Real, Device, Index >
        * \include TridiagonalMatrixViewExample_reduceAllRows.out
        */
       template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
-      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero );
+      void reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity );
 
       /**
        * \brief Method for iteration over all matrix rows for constant instances.
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp
index 5e7bfe7567f67527d46667a7e8d4420a0dd8b31d..cf510bf8ccdcb9548ed8c7feb7795cb969fa8b11 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp
@@ -279,14 +279,13 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ ) const
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    const auto values_view = this->values.getConstView();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       if( rowIdx == 0 )
       {
          sum = reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) );
@@ -323,14 +322,13 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero_ )
+reduceRows( IndexType first, IndexType last, Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
    using Real_ = decltype( fetch( IndexType(), IndexType(), RealType() ) );
    auto values_view = this->values.getConstView();
    const auto indexer = this->indexer;
-   const auto zero = zero_;
    auto f = [=] __cuda_callable__ ( IndexType rowIdx ) mutable {
-      Real_ sum( zero );
+      Real_ sum = identity;
       if( rowIdx == 0 )
       {
          sum = reduce( sum, fetch( 0, 1, values_view[ indexer.getGlobalIndex( 0, 1 ) ] ) );
@@ -367,9 +365,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero ) const
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity ) const
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
@@ -379,9 +377,9 @@ template< typename Real,
    template< typename Fetch, typename Reduce, typename Keep, typename FetchReal >
 void
 TridiagonalMatrixView< Real, Device, Index, Organization >::
-reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& zero )
+reduceAllRows( Fetch& fetch, Reduce& reduce, Keep& keep, const FetchReal& identity )
 {
-   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, zero );
+   this->reduceRows( 0, this->indexer.getNonemptyRowsCount(), fetch, reduce, keep, identity );
 }
 
 template< typename Real,
diff --git a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
index 36f28ba458b67e872f9ea7d317f3654fb019215d..0353ded098ecdf0437c0c8cadcb6c9a47fee72e6 100644
--- a/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
+++ b/src/TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h
@@ -12,6 +12,7 @@
 
 #pragma once
 
+#include <TNL/Algorithms/scan.h>
 #include <TNL/Containers/ByteArraySynchronizer.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Matrices/DenseMatrix.h>
@@ -383,7 +384,7 @@ public:
             // scan the rowPointers array to convert
             Containers::VectorView< GlobalIndexType, Devices::Host, GlobalIndexType > rowPointersView;
             rowPointersView.bind( recv_rowPointers );
-            rowPointersView.template scan< Algorithms::ScanType::Exclusive >();
+            Algorithms::inplaceExclusiveScan( rowPointersView );
          }
 
          // allocate column indices
diff --git a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
index 120cadf808f7d4171d80e4f7ac375939ad1caf17..cfe5a9246bcf7db4feb391ac9c2786f351aabbbe 100644
--- a/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
+++ b/src/TNL/Meshes/DistributedMeshes/distributeSubentities.h
@@ -14,6 +14,8 @@
 
 #include <TNL/Meshes/DistributedMeshes/DistributedMeshSynchronizer.h>
 #include <TNL/Meshes/MeshDetails/layers/EntityTags/Traits.h>
+#include <TNL/Algorithms/scan.h>
+#include <TNL/Algorithms/contains.h>
 
 namespace TNL {
 namespace Meshes {
@@ -238,7 +240,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
                      globalOffsets.getData(), 1,
                      mesh.getCommunicationGroup() );
    }
-   globalOffsets.template scan< Algorithms::ScanType::Exclusive >();
+   Algorithms::inplaceExclusiveScan( globalOffsets );
 
    // 3. assign global indices to the local entities and a padding index to ghost entities
    //    (later we can check the padding index to know if an index was set or not)
@@ -390,7 +392,7 @@ distributeSubentities( DistributedMesh& mesh, bool preferHighRanks = true )
       if( all_done )
          break;
    }
-   if( mesh.template getGlobalIndices< Dimension >().containsValue( padding_index ) )
+   if( Algorithms::contains( mesh.template getGlobalIndices< Dimension >(), padding_index ) )
       throw std::runtime_error( "some global indices were left unset" );
 
    // 7. reorder the entities to make sure that global indices are sorted
diff --git a/src/TNL/TypeInfo.h b/src/TNL/TypeInfo.h
index 61377fbb8593ce9659c75dc355ad8abfe0838333..36940e8681154926f56de4de38616f8eefac174f 100644
--- a/src/TNL/TypeInfo.h
+++ b/src/TNL/TypeInfo.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <typeinfo>
+#include <type_traits>
 #include <string>
 
 #if defined( __has_include )
@@ -27,11 +28,10 @@
    #include <cstdlib>  // std::free
 #endif
 
-#include <TNL/TypeTraits.h>
 #include <TNL/String.h>
 
 namespace TNL {
-namespace __getType_impl {
+namespace detail {
 
 inline std::string
 demangle( const char* name )
@@ -49,7 +49,34 @@ demangle( const char* name )
    return name;
 }
 
-} // namespace __getType_impl
+/**
+ * \brief Type trait for checking if T has a static getSerializationType method.
+ */
+template< typename T >
+class HasStaticGetSerializationType
+{
+private:
+   template< typename U >
+   static constexpr auto check(U*)
+   -> typename
+      std::enable_if_t<
+         ! std::is_same<
+               decltype( U::getSerializationType() ),
+               void
+            >::value,
+         std::true_type
+      >;
+
+   template< typename >
+   static constexpr std::false_type check(...);
+
+   using type = decltype(check<std::decay_t<T>>(0));
+
+public:
+    static constexpr bool value = type::value;
+};
+
+} // namespace detail
 
 /**
  * \brief Returns a human-readable string representation of given type.
@@ -61,7 +88,7 @@ demangle( const char* name )
 template< typename T >
 String getType()
 {
-   return __getType_impl::demangle( typeid(T).name() );
+   return detail::demangle( typeid(T).name() );
 }
 
 /**
@@ -74,7 +101,7 @@ String getType()
 template< typename T >
 String getType( T&& obj )
 {
-   return __getType_impl::demangle( typeid(obj).name() );
+   return detail::demangle( typeid(obj).name() );
 }
 
 /**
@@ -87,7 +114,7 @@ String getType( T&& obj )
  * serialization type for multiple devices.
  */
 template< typename T,
-          std::enable_if_t< ! HasStaticGetSerializationType< T >::value, bool > = true >
+          std::enable_if_t< ! detail::HasStaticGetSerializationType< T >::value, bool > = true >
 String getSerializationType()
 {
    return getType< T >();
@@ -98,7 +125,7 @@ String getSerializationType()
  *        static \e getSerializationType method to override the default behaviour.
  */
 template< typename T,
-          std::enable_if_t< HasStaticGetSerializationType< T >::value, bool > = true >
+          std::enable_if_t< detail::HasStaticGetSerializationType< T >::value, bool > = true >
 String getSerializationType()
 {
    return T::getSerializationType();
diff --git a/src/TNL/TypeTraits.h b/src/TNL/TypeTraits.h
index c5d0fea363e60a9fd096fcf96058e3df3b2c5942..3a199e1b2cfbf1b9c6134d6f3b68a25dece4bd44 100644
--- a/src/TNL/TypeTraits.h
+++ b/src/TNL/TypeTraits.h
@@ -130,6 +130,20 @@ public:
     static constexpr bool value = type::value;
 };
 
+/**
+ * \brief Type trait for checking if T is a [scalar type](https://en.wikipedia.org/wiki/Scalar_(mathematics))
 + * (in the mathematical sense). Not to be confused with \ref std::is_scalar.
+ *
+ * For example, \ref std::is_arithmetic "arithmetic types" as defined by the STL
+ * are scalar types. TNL also provides additional scalar types, e.g. for
+ * extended precision arithmetics. Users may also define specializations of this
+ * trait class for their custom scalar types.
+ */
+template< typename T >
+struct IsScalarType
+: public std::is_arithmetic< T >
+{};
+
 /**
  * \brief Type trait for checking if T is an array type, e.g.
  *        \ref Containers::Array or \ref Containers::Vector.
@@ -226,33 +240,6 @@ struct IsViewType
             std::is_same< typename std::decay_t<T>::ViewType, T >::value >
 {};
 
-/**
- * \brief Type trait for checking if T has a static getSerializationType method.
- */
-template< typename T >
-class HasStaticGetSerializationType
-{
-private:
-   template< typename U >
-   static constexpr auto check(U*)
-   -> typename
-      std::enable_if_t<
-         ! std::is_same<
-               decltype( U::getSerializationType() ),
-               void
-            >::value,
-         std::true_type
-      >;
-
-   template< typename >
-   static constexpr std::false_type check(...);
-
-   using type = decltype(check<std::decay_t<T>>(0));
-
-public:
-    static constexpr bool value = type::value;
-};
-
 /**
  * \brief Type trait for checking if T has getCommunicationGroup method.
  */
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 31028036b233748f5a5d57fa4784354265b52156..aa14ae462a4c3ca32e0300129388768412dc62d4 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -2,16 +2,23 @@ ADD_SUBDIRECTORY( Segments )
 ADD_SUBDIRECTORY( Sorting )
 
 set( COMMON_TESTS
+         containsTest
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
-         ReductionTest
          staticForTest
          unrolledForTest
 )
 
-set( CPP_TESTS )
-set( CUDA_TESTS )
+set( CPP_TESTS
+         reduceTest
+         scanTest
+         SegmentedScanTest
+)
+set( CUDA_TESTS
+         reduceTestCuda
+         scanTestCuda
+)
 if( BUILD_CUDA )
    set( CUDA_TESTS  ${CUDA_TESTS} ${COMMON_TESTS} )
 else()
@@ -32,3 +39,26 @@ if( BUILD_CUDA )
       add_test( ${target} ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${target}${CMAKE_EXECUTABLE_SUFFIX} )
    endforeach()
 endif()
+
+
+if( ${BUILD_MPI} )
+   ADD_EXECUTABLE( distributedScanTest distributedScanTest.cpp )
+   TARGET_COMPILE_OPTIONS( distributedScanTest PRIVATE ${CXX_TESTS_FLAGS} )
+   TARGET_LINK_LIBRARIES( distributedScanTest ${GTEST_BOTH_LIBRARIES} )
+
+   if( BUILD_CUDA )
+      CUDA_ADD_EXECUTABLE( distributedScanTestCuda distributedScanTestCuda.cu
+                           OPTIONS ${CXX_TESTS_FLAGS} )
+      TARGET_LINK_LIBRARIES( distributedScanTestCuda ${GTEST_BOTH_LIBRARIES} )
+   endif()
+
+   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
+   ADD_TEST( NAME distributedScanTest COMMAND "mpirun" ${mpi_test_parameters})
+   ADD_TEST( NAME distributedScanTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTest${CMAKE_EXECUTABLE_SUFFIX}" )
+
+   if( BUILD_CUDA )
+      SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
+      ADD_TEST( NAME distributedScanTestCuda COMMAND "mpirun" ${mpi_test_parameters})
+      ADD_TEST( NAME distributedScanTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/distributedScanTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
+   endif()
+endif()
diff --git a/src/UnitTests/Algorithms/MemoryOperationsTest.h b/src/UnitTests/Algorithms/MemoryOperationsTest.h
index ebfb01f1bf62144d2ff950c4d3265cc7474dab3b..61938d82d6413ca6f32ddafa40338384e8b2ecdc 100644
--- a/src/UnitTests/Algorithms/MemoryOperationsTest.h
+++ b/src/UnitTests/Algorithms/MemoryOperationsTest.h
@@ -144,44 +144,6 @@ TYPED_TEST( MemoryOperationsTest, compareWithConversion_host )
    allocator2.deallocate( data2, ARRAY_TEST_SIZE );
 }
 
-TYPED_TEST( MemoryOperationsTest, containsValue_host )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using Allocator = Allocators::Host< ValueType >;
-
-   Allocator allocator;
-   ValueType* data = allocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      data[ i ] = i % 10;
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( ( MemoryOperations< Devices::Host >::containsValue( data, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Host >::containsValue( data, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   allocator.deallocate( data, ARRAY_TEST_SIZE );
-}
-
-TYPED_TEST( MemoryOperationsTest, containsOnlyValue_host )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using Allocator = Allocators::Host< ValueType >;
-
-   Allocator allocator;
-   ValueType* data = allocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      data[ i ] = i % 10;
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Host >::containsOnlyValue( data, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      data[ i ] = 10;
-   EXPECT_TRUE( ( MemoryOperations< Devices::Host >::containsOnlyValue( data, ARRAY_TEST_SIZE, (ValueType) 10 ) ) );
-
-   allocator.deallocate( data, ARRAY_TEST_SIZE );
-}
-
 
 #ifdef HAVE_CUDA
 TYPED_TEST( MemoryOperationsTest, allocateMemory_cuda )
@@ -353,58 +315,6 @@ TYPED_TEST( MemoryOperationsTest, compareWithConversions_cuda )
    cudaAllocator1.deallocate( deviceData, ARRAY_TEST_SIZE );
    cudaAllocator2.deallocate( deviceData2, ARRAY_TEST_SIZE );
 }
-
-TYPED_TEST( MemoryOperationsTest, containsValue_cuda )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using HostAllocator = Allocators::Host< ValueType >;
-   using CudaAllocator = Allocators::Cuda< ValueType >;
-
-   HostAllocator hostAllocator;
-   CudaAllocator cudaAllocator;
-   ValueType* hostData = hostAllocator.allocate( ARRAY_TEST_SIZE );
-   ValueType* deviceData = cudaAllocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      hostData[ i ] = i % 10;
-   MultiDeviceMemoryOperations< Devices::Cuda, Devices::Host >::copy( deviceData, hostData, ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( ( MemoryOperations< Devices::Cuda >::containsValue( deviceData, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Cuda >::containsValue( deviceData, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   hostAllocator.deallocate( hostData, ARRAY_TEST_SIZE );
-   cudaAllocator.deallocate( deviceData, ARRAY_TEST_SIZE );
-}
-
-TYPED_TEST( MemoryOperationsTest, containsOnlyValue_cuda )
-{
-   using ValueType = typename TestFixture::ValueType;
-   using HostAllocator = Allocators::Host< ValueType >;
-   using CudaAllocator = Allocators::Cuda< ValueType >;
-
-   HostAllocator hostAllocator;
-   CudaAllocator cudaAllocator;
-   ValueType* hostData = hostAllocator.allocate( ARRAY_TEST_SIZE );
-   ValueType* deviceData = cudaAllocator.allocate( ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      hostData[ i ] = i % 10;
-   MultiDeviceMemoryOperations< Devices::Cuda, Devices::Host >::copy( deviceData, hostData, ARRAY_TEST_SIZE );
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( ( MemoryOperations< Devices::Cuda >::containsOnlyValue( deviceData, ARRAY_TEST_SIZE, (ValueType) i ) ) );
-
-   for( int i = 0; i < ARRAY_TEST_SIZE; i++ )
-      hostData[ i ] = 10;
-   MultiDeviceMemoryOperations< Devices::Cuda, Devices::Host >::copy( deviceData, hostData, ARRAY_TEST_SIZE );
-
-   EXPECT_TRUE( ( MemoryOperations< Devices::Cuda >::containsOnlyValue( deviceData, ARRAY_TEST_SIZE, (ValueType) 10 ) ) );
-
-   hostAllocator.deallocate( hostData, ARRAY_TEST_SIZE );
-   cudaAllocator.deallocate( deviceData, ARRAY_TEST_SIZE );
-}
 #endif // HAVE_CUDA
 #endif // HAVE_GTEST
 
diff --git a/src/UnitTests/Algorithms/ReductionTest.cpp b/src/UnitTests/Algorithms/ReductionTest.cpp
deleted file mode 100644
index 4d630e5f91eda483499442377b6cae3e1a88c6e0..0000000000000000000000000000000000000000
--- a/src/UnitTests/Algorithms/ReductionTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.cu b/src/UnitTests/Algorithms/ReductionTest.cu
deleted file mode 100644
index 4d630e5f91eda483499442377b6cae3e1a88c6e0..0000000000000000000000000000000000000000
--- a/src/UnitTests/Algorithms/ReductionTest.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
deleted file mode 100644
index b880642b8c0abb5549dd6e8fc818ec8947c99cec..0000000000000000000000000000000000000000
--- a/src/UnitTests/Algorithms/ReductionTest.h
+++ /dev/null
@@ -1,246 +0,0 @@
-/***************************************************************************
-                          ReductionTest.h  -  description
-                             -------------------
-    begin                : Jul 2, 2021
-    copyright            : (C) 2021 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#include <TNL/Devices/Host.h>
-#include <TNL/Devices/Cuda.h>
-#include <TNL/Containers/Array.h>
-#include <TNL/Algorithms/Reduction.h>
-
-#ifdef HAVE_GTEST
-#include <gtest/gtest.h>
-#endif
-
-using namespace TNL;
-
-#ifdef HAVE_GTEST
-
-template< typename Device >
-void ReduceTest_sum()
-{
-   using Array = Containers::Array< int, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.setValue( 1 );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Plus{} );
-      EXPECT_EQ( res, size );
-   }
-}
-
-template< typename Device >
-void ReduceTest_min()
-{
-   using Array = Containers::Array< int, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Min{} );
-      EXPECT_EQ( res, 1 );
-   }
-}
-
-template< typename Device >
-void ReduceTest_max()
-{
-   using Array = Containers::Array< int, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Max{} );
-      EXPECT_EQ( res, size );
-   }
-}
-
-template< typename Device >
-void ReduceTest_minWithArg()
-{
-   using Array = Containers::Array< int, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MinWithArg{} );
-      EXPECT_EQ( res.first, 1 );
-      EXPECT_EQ( res.second, 0 );
-   }
-}
-
-template< typename Device >
-void ReduceTest_maxWithArg()
-{
-   using Array = Containers::Array< int, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MaxWithArg{} );
-      EXPECT_EQ( res.first, size );
-      EXPECT_EQ( res.second, size - 1 );
-   }
-}
-
-template< typename Device >
-void ReduceTest_logicalAnd()
-{
-   using Array = Containers::Array< bool, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, bool& value ) { value = ( bool ) ( idx % 2 ); } );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalAnd{} );
-      EXPECT_EQ( res, false );
-   }
-}
-
-template< typename Device >
-void ReduceTest_logicalOr()
-{
-   using Array = Containers::Array< bool, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, bool& value ) { value = ( bool ) ( idx % 2 ); } );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalOr{} );
-      EXPECT_EQ( res, true );
-   }
-}
-
-template< typename Device >
-void ReduceTest_bitAnd()
-{
-   using Array = Containers::Array< char, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, char& value ) { value = 1 | ( 1 << ( idx % 8 ) ); } );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitAnd{} );
-      EXPECT_EQ( res, 1 );
-   }
-}
-
-template< typename Device >
-void ReduceTest_bitOr()
-{
-   using Array = Containers::Array< char, Device >;
-   Array a;
-   for( int size = 100; size <= 1000000; size *= 10 )
-   {
-      a.setSize( size );
-      a.forAllElements( [] __cuda_callable__ ( int idx, char& value ) { value = 1 << ( idx % 8 );} );
-      auto a_view = a.getView();
-
-      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitOr{} );
-      EXPECT_EQ( res, ( char ) 255 );
-   }
-}
-
-// test fixture for typed tests
-template< typename Device >
-class ReduceTest : public ::testing::Test
-{
-protected:
-   using DeviceType = Device;
-};
-
-// types for which ArrayTest is instantiated
-using DeviceTypes = ::testing::Types<
-   Devices::Host
-#ifdef HAVE_CUDA
-   ,Devices::Cuda
-#endif
-   >;
-
-TYPED_TEST_SUITE( ReduceTest, DeviceTypes );
-
-TYPED_TEST( ReduceTest, sum )
-{
-   ReduceTest_sum< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, min )
-{
-   ReduceTest_min< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, max )
-{
-   ReduceTest_max< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, minWithArg )
-{
-   ReduceTest_minWithArg< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, maxWithArg )
-{
-   ReduceTest_maxWithArg< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, logicalAnd )
-{
-   ReduceTest_logicalAnd< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, logicalOr )
-{
-   ReduceTest_logicalOr< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, bitAnd )
-{
-   ReduceTest_bitAnd< typename TestFixture::DeviceType >();
-}
-
-TYPED_TEST( ReduceTest, bitOr )
-{
-   ReduceTest_bitOr< typename TestFixture::DeviceType >();
-}
-
-#endif
-
-#include "../main.h"
diff --git a/src/UnitTests/Algorithms/SegmentedScanTest.cpp b/src/UnitTests/Algorithms/SegmentedScanTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..74d5a80a39821f823029354e5ac03a6066c464f1
--- /dev/null
+++ b/src/UnitTests/Algorithms/SegmentedScanTest.cpp
@@ -0,0 +1 @@
+#include "SegmentedScanTest.h"
diff --git a/src/UnitTests/Algorithms/SegmentedScanTest.h b/src/UnitTests/Algorithms/SegmentedScanTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b467ca88f05079cd296831948215c423ec64e2b
--- /dev/null
+++ b/src/UnitTests/Algorithms/SegmentedScanTest.h
@@ -0,0 +1,183 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+
+#include <TNL/Arithmetics/Quad.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/SegmentedScan.h>
+
+#include "gtest/gtest.h"
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Arithmetics;
+using namespace TNL::Algorithms;
+
+// should be small enough to have fast tests, but larger than minGPUReductionDataSize
+// and large enough to require multiple CUDA blocks for reduction
+constexpr int ARRAY_TEST_SIZE = 10000;
+
+// test fixture for typed tests
+template< typename Array >
+class SegmentedScanTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;
+   using ViewType = ArrayView< typename Array::ValueType, typename Array::DeviceType, typename Array::IndexType >;
+};
+
+// types for which SegmentedScanTest is instantiated
+// TODO: Quad must be fixed
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< int,            Devices::Sequential, short >,
+   Array< long,           Devices::Sequential, short >,
+   Array< float,          Devices::Sequential, short >,
+   Array< double,         Devices::Sequential, short >,
+   //Array< Quad< float >,  Devices::Sequential, short >,
+   //Array< Quad< double >, Devices::Sequential, short >,
+   Array< int,            Devices::Sequential, int >,
+   Array< long,           Devices::Sequential, int >,
+   Array< float,          Devices::Sequential, int >,
+   Array< double,         Devices::Sequential, int >,
+   //Array< Quad< float >,  Devices::Sequential, int >,
+   //Array< Quad< double >, Devices::Sequential, int >,
+   Array< int,            Devices::Sequential, long >,
+   Array< long,           Devices::Sequential, long >,
+   Array< float,          Devices::Sequential, long >,
+   Array< double,         Devices::Sequential, long >,
+   //Array< Quad< float >,  Devices::Sequential, long >,
+   //Array< Quad< double >, Devices::Sequential, long >,
+
+   Array< int,            Devices::Host, short >,
+   Array< long,           Devices::Host, short >,
+   Array< float,          Devices::Host, short >,
+   Array< double,         Devices::Host, short >,
+   //Array< Quad< float >,  Devices::Host, short >,
+   //Array< Quad< double >, Devices::Host, short >,
+   Array< int,            Devices::Host, int >,
+   Array< long,           Devices::Host, int >,
+   Array< float,          Devices::Host, int >,
+   Array< double,         Devices::Host, int >,
+   //Array< Quad< float >,  Devices::Host, int >,
+   //Array< Quad< double >, Devices::Host, int >,
+   Array< int,            Devices::Host, long >,
+   Array< long,           Devices::Host, long >,
+   Array< float,          Devices::Host, long >,
+   Array< double,         Devices::Host, long >
+   //Array< Quad< float >,  Devices::Host, long >,
+   //Array< Quad< double >, Devices::Host, long >
+#endif
+// TODO: segmented scan for CUDA is not implemented yet
+//#ifdef HAVE_CUDA
+//   Array< int,            Devices::Cuda, short >,
+//   Array< long,           Devices::Cuda, short >,
+//   Array< float,          Devices::Cuda, short >,
+//   Array< double,         Devices::Cuda, short >,
+//   //Array< Quad< float >,  Devices::Cuda, short >,
+//   //Array< Quad< double >, Devices::Cuda, short >,
+//   Array< int,            Devices::Cuda, int >,
+//   Array< long,           Devices::Cuda, int >,
+//   Array< float,          Devices::Cuda, int >,
+//   Array< double,         Devices::Cuda, int >,
+//   //Array< Quad< float >,  Devices::Cuda, int >,
+//   //Array< Quad< double >, Devices::Cuda, int >,
+//   Array< int,            Devices::Cuda, long >,
+//   Array< long,           Devices::Cuda, long >,
+//   Array< float,          Devices::Cuda, long >,
+//   Array< double,         Devices::Cuda, long >
+//   //Array< Quad< float >,  Devices::Cuda, long >,
+//   //Array< Quad< double >, Devices::Cuda, long >
+//#endif
+>;
+
+TYPED_TEST_SUITE( SegmentedScanTest, ArrayTypes );
+
+template< typename Array >
+void setLinearSequence( Array& array )
+{
+   using Value = typename Array::ValueType;
+   using Index = typename Array::IndexType;
+   auto f1 = [] __cuda_callable__ ( Index i, Value& value ) { value = i; };
+   array.forAllElements( f1 );
+}
+
+template< typename FlagsView >
+void setupFlags( FlagsView& flags )
+{
+   using Value = typename FlagsView::ValueType;
+   using Index = typename FlagsView::IndexType;
+   auto f1 = [] __cuda_callable__ ( Index i, Value& value ) { value = ( i % 5 == 0 ); };
+   flags.forAllElements( f1 );
+}
+
+TYPED_TEST( SegmentedScanTest, inclusive )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   using ViewType = typename TestFixture::ViewType;
+   using ValueType = typename ArrayType::ValueType;
+   using DeviceType = typename ArrayType::DeviceType;
+   using IndexType = typename ArrayType::IndexType;
+   using FlagsArrayType = Array< bool, DeviceType, IndexType >;
+   using FlagsViewType = ArrayView< bool, DeviceType, IndexType >;
+   const int size = ARRAY_TEST_SIZE;
+
+   ArrayType v( size );
+   ViewType v_view( v );
+
+   FlagsArrayType flags( size ), flags_copy( size );
+   FlagsViewType flags_view( flags );
+   setupFlags( flags_view );
+   flags_copy = flags_view;
+
+   v = 0;
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v.getElement( i ), 0 );
+   flags_view = flags_copy;
+
+   v = 1;
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v.getElement( i ), ( i % 5 ) + 1 );
+   flags_view = flags_copy;
+
+   setLinearSequence( v );
+   SegmentedScan< DeviceType >::perform( v, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
+   for( int i = 1; i < size; i++ )
+   {
+      if( flags.getElement( i ) )
+         EXPECT_EQ( v.getElement( i ), i );
+      else
+         EXPECT_EQ( v.getElement( i ) - v.getElement( i - 1 ), i );
+   }
+   flags_view = flags_copy;
+
+   v_view = 0;
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_view.getElement( i ), 0 );
+   flags_view = flags_copy;
+
+   v_view = 1;
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
+   for( int i = 0; i < size; i++ )
+      EXPECT_EQ( v_view.getElement( i ), ( i % 5 ) + 1 );
+   flags_view = flags_copy;
+
+   setLinearSequence( v );
+   SegmentedScan< DeviceType >::perform( v_view, flags_view, 0, size, TNL::Plus{}, TNL::Plus::template getIdentity< ValueType >() );
+   for( int i = 1; i < size; i++ )
+   {
+      if( flags.getElement( i ) )
+         EXPECT_EQ( v_view.getElement( i ), i );
+      else
+         EXPECT_EQ( v_view.getElement( i ) - v_view.getElement( i - 1 ), i );
+   }
+}
+
+// TODO: test exclusive segmented scan
+
+#endif // HAVE_GTEST
+
+#include "../main.h"
diff --git a/src/UnitTests/Algorithms/containsTest.cpp b/src/UnitTests/Algorithms/containsTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7435d1282dfa07df6033460be3586993dfced558
--- /dev/null
+++ b/src/UnitTests/Algorithms/containsTest.cpp
@@ -0,0 +1 @@
+#include "containsTest.h"
diff --git a/src/UnitTests/Algorithms/containsTest.cu b/src/UnitTests/Algorithms/containsTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..7435d1282dfa07df6033460be3586993dfced558
--- /dev/null
+++ b/src/UnitTests/Algorithms/containsTest.cu
@@ -0,0 +1 @@
+#include "containsTest.h"
diff --git a/src/UnitTests/Algorithms/containsTest.h b/src/UnitTests/Algorithms/containsTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..6598924ffcab2dee769666d60b130a9d4447c1e0
--- /dev/null
+++ b/src/UnitTests/Algorithms/containsTest.h
@@ -0,0 +1,106 @@
+/***************************************************************************
+                          containsTest.h  -  description
+                             -------------------
+    begin                : Jul 15, 2013
+    copyright            : (C) 2013 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#ifdef HAVE_GTEST
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/contains.h>
+
+#include "gtest/gtest.h"
+
+using namespace TNL;
+using namespace TNL::Algorithms;
+using namespace TNL::Containers;
+
+// test fixture for typed tests
+template< typename Array >
+class ContainsTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;
+};
+
+// types for which ContainsTest is instantiated
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< int,    Devices::Sequential, short >,
+   Array< long,   Devices::Sequential, short >,
+   Array< double, Devices::Sequential, short >,
+   Array< int,    Devices::Sequential, int >,
+   Array< long,   Devices::Sequential, int >,
+   Array< double, Devices::Sequential, int >,
+   Array< int,    Devices::Sequential, long >,
+   Array< long,   Devices::Sequential, long >,
+   Array< double, Devices::Sequential, long >,
+
+   Array< int,    Devices::Host, short >,
+   Array< long,   Devices::Host, short >,
+   Array< double, Devices::Host, short >,
+   Array< int,    Devices::Host, int >,
+   Array< long,   Devices::Host, int >,
+   Array< double, Devices::Host, int >,
+   Array< int,    Devices::Host, long >,
+   Array< long,   Devices::Host, long >,
+   Array< double, Devices::Host, long >
+#endif
+#ifdef HAVE_CUDA
+   Array< int,    Devices::Cuda, short >,
+   Array< long,   Devices::Cuda, short >,
+   Array< double, Devices::Cuda, short >,
+   Array< int,    Devices::Cuda, int >,
+   Array< long,   Devices::Cuda, int >,
+   Array< double, Devices::Cuda, int >,
+   Array< int,    Devices::Cuda, long >,
+   Array< long,   Devices::Cuda, long >,
+   Array< double, Devices::Cuda, long >
+#endif
+>;
+
+TYPED_TEST_SUITE( ContainsTest, ArrayTypes );
+
+TYPED_TEST( ContainsTest, contains )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+
+   ArrayType array;
+   array.setSize( 1024 );
+
+   for( int i = 0; i < array.getSize(); i++ )
+      array.setElement( i, i % 10 );
+
+   for( int i = 0; i < 10; i++ )
+      EXPECT_TRUE( contains( array, i ) );
+
+   for( int i = 10; i < 20; i++ )
+      EXPECT_FALSE( contains( array, i ) );
+}
+
+TYPED_TEST( ContainsTest, containsOnlyValue )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+
+   ArrayType array;
+   array.setSize( 1024 );
+
+   for( int i = 0; i < array.getSize(); i++ )
+      array.setElement( i, i % 10 );
+
+   for( int i = 0; i < 20; i++ )
+      EXPECT_FALSE( containsOnlyValue( array, i ) );
+
+   array.setValue( 100 );
+   EXPECT_TRUE( containsOnlyValue( array, 100 ) );
+}
+
+#endif // HAVE_GTEST
+
+
+#include "../main.h"
diff --git a/src/UnitTests/Algorithms/distributedScanTest.cpp b/src/UnitTests/Algorithms/distributedScanTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1b60321b48faa922d09cbd2b6af86f09a6e7c92
--- /dev/null
+++ b/src/UnitTests/Algorithms/distributedScanTest.cpp
@@ -0,0 +1 @@
+#include "distributedScanTest.h"
diff --git a/src/UnitTests/Algorithms/distributedScanTest.h b/src/UnitTests/Algorithms/distributedScanTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..102f49dc6d3f89cde327417e39183dc62dbe913e
--- /dev/null
+++ b/src/UnitTests/Algorithms/distributedScanTest.h
@@ -0,0 +1,768 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include <TNL/Containers/DistributedArray.h>
+#include <TNL/Containers/DistributedVectorView.h>
+#include <TNL/Containers/Partitioner.h>
+#include <TNL/Algorithms/distributedScan.h>
+
+#define DISTRIBUTED_VECTOR
+#include "../Containers/VectorHelperFunctions.h"
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::detail;
+using namespace TNL::MPI;
+
+/*
+ * Light check of the distributed scan algorithms.
+ *
+ * - Number of processes is not limited.
+ * - Global size is hardcoded as 9377 * nproc (see the globalSize member below).
+ * - Communication group is hardcoded as AllGroup -- it may be changed as needed.
+ */
+template< typename DistributedArray >
+class DistributedScanTest
+: public ::testing::Test
+{
+protected:
+   using ValueType = typename DistributedArray::ValueType;
+   using DeviceType = typename DistributedArray::DeviceType;
+   using IndexType = typename DistributedArray::IndexType;
+   using DistributedArrayType = DistributedArray;
+   using DistributedArrayView = Containers::DistributedArrayView< ValueType, DeviceType, IndexType >;
+   using DistributedVectorView = Containers::DistributedVectorView< ValueType, DeviceType, IndexType >;
+   using HostDistributedArrayType = typename DistributedArrayType::template Self< ValueType, Devices::Sequential >;
+   using LocalRangeType = typename DistributedArray::LocalRangeType;
+   using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
+   using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >;
+
+   const MPI_Comm group = AllGroup();
+
+   DistributedArrayType a, b, c;
+   DistributedArrayView a_view, b_view, c_view;
+   DistributedVectorView av_view, bv_view, cv_view;
+   HostDistributedArrayType array_host, input_host, expected_host;
+
+   const int rank = GetRank(group);
+   const int nproc = GetSize(group);
+
+   // should be small enough to have fast tests, but large enough to test
+   // scan with multiple CUDA grids
+   // the per-rank factor 9377 is prime; NOTE(review): 9377 * nproc splits evenly across ranks -- confirm whether non-uniform distribution was intended
+   const int globalSize = 9377 * nproc;
+
+   LocalRangeType localRange;
+
+   // some arbitrary value (but must be 0 if not distributed)
+   const int ghosts = (nproc > 1) ? 4 : 0;
+
+   DistributedScanTest()
+   {
+      resetWorkingArrays();
+      input_host = a;
+      input_host.setSynchronizer( std::make_shared<HostSynchronizer>( a.getLocalRange(), ghosts / 2, group ) );
+      expected_host = input_host;
+   }
+
+   void resetWorkingArrays()
+   {
+      localRange = Partitioner< IndexType >::splitRange( globalSize, group );
+      a.setDistribution( localRange, ghosts, globalSize, group );
+      a.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
+
+      a.setValue( -1 );
+      c = b = a;
+      a_view.bind( a );
+      b_view.bind( b );
+      c_view.bind( c );
+      av_view.bind( a );
+      bv_view.bind( b );
+      cv_view.bind( c );
+
+      // make sure that we perform tests with multiple CUDA grids
+#ifdef HAVE_CUDA
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
+      }
+#endif
+   }
+
+   template< Algorithms::detail::ScanType ScanType >
+   void checkResult( const DistributedArrayType& array, bool check_cuda_grids = true )
+   {
+#ifdef HAVE_CUDA
+      // skip the check for too small arrays
+      if( check_cuda_grids && array.getLocalRange().getSize() > 256 ) {
+         // we don't care which kernel launcher was actually used
+         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
+                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
+         EXPECT_GT( gridsCount, 1 );
+      }
+#endif
+
+      array_host = array;
+
+      for( int i = a.getLocalRange().getBegin(); i < a.getLocalRange().getEnd(); i++ )
+         EXPECT_EQ( array_host[ i ], expected_host[ i ] ) << "arrays differ at index i = " << i;
+   }
+};
+
+// types for which DistributedScanTest is instantiated
+using DistributedArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   DistributedArray< double, Devices::Sequential, int >,
+   DistributedArray< double, Devices::Host, int >
+#endif
+#ifdef HAVE_CUDA
+   DistributedArray< double, Devices::Cuda, int >
+#endif
+>;
+
+TYPED_TEST_SUITE( DistributedScanTest, DistributedArrayTypes );
+
+// TODO: test that horizontal operations are computed for ghost values without synchronization
+
+TYPED_TEST( DistributedScanTest, distributedInclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i + 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   distributedInclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceInclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i + 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   distributedInplaceInclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedExclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedExclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, distributedExclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   distributedExclusiveScan( this->a_view, this->b_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( DistributedScanTest, distributedInplaceExclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   distributedInplaceExclusiveScan( this->a_view, 0, this->globalSize, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+
+TYPED_TEST( DistributedScanTest, multiplication )
+{
+   this->localRange = Partitioner< typename TestFixture::IndexType >::splitRange( 10, this->group );
+   this->input_host.setDistribution( this->localRange, 0, 10, this->group );
+   this->input_host.setValue( 2 );
+   this->expected_host = this->input_host;
+
+   // exclusive scan test
+   int value = 1;
+   for( int i = 0; i < this->localRange.getEnd(); i++ ) {
+      if( this->localRange.getBegin() <= i )
+         this->expected_host[ i ] = value;
+      value *= 2;
+   }
+
+   this->a = this->input_host;
+   this->b = this->input_host;
+   distributedExclusiveScan( this->a, this->b, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   distributedInplaceExclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] *= 2;
+
+   this->a.reset();
+   this->a = this->input_host;
+   this->b = this->input_host;
+   distributedInclusiveScan( this->a, this->b, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   distributedInplaceInclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( DistributedScanTest, custom_begin_end )
+{
+   using IndexType = typename TestFixture::IndexType;
+
+   // make it span multiple processes
+   const IndexType begin = 42;
+   const IndexType end = (this->nproc > 1) ? this->globalSize / this->nproc + begin : this->globalSize - begin;
+
+   // exclusive scan test
+   this->input_host.setValue( 1 );
+   this->expected_host.setValue( 1 );
+   int value = 0;
+   for( int i = begin; i < end; i++ ) {
+      if( this->localRange.getBegin() <= i && i < this->localRange.getEnd() )
+         this->expected_host[ i ] = value;
+      value++;
+   }
+
+   this->a = this->input_host;
+   this->b = this->input_host;
+   distributedExclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
+   distributedInplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a, false );
+
+   // inclusive scan test
+   for( int i = begin; i < end; i++ )
+      if( this->localRange.getBegin() <= i && i < this->localRange.getEnd() )
+         this->expected_host[ i ]++;
+
+   this->a.reset();
+   this->a = this->input_host;
+   this->b = this->input_host;
+   distributedInclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
+   distributedInplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a, false );
+}
+
+TYPED_TEST( DistributedScanTest, empty_range )
+{
+   using IndexType = typename TestFixture::IndexType;
+
+   this->localRange = Partitioner< typename TestFixture::IndexType >::splitRange( 42, this->group );
+   this->input_host.setDistribution( this->localRange, 0, 42, this->group );
+   this->input_host.setValue( 1 );
+   this->expected_host = this->input_host;
+
+   const IndexType begin = 2;
+   const IndexType end = 1;
+
+   // exclusive scan test
+   this->a = this->input_host;
+   this->b = this->input_host;
+   distributedExclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
+   distributedInplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a, false );
+
+   // inclusive scan test
+   this->a.reset();
+   this->a = this->input_host;
+   this->b = this->input_host;
+   distributedInclusiveScan( this->a, this->b, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->b, false );
+   EXPECT_EQ( this->a, this->input_host );
+   distributedInplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a, false );
+}
+
+TYPED_TEST( DistributedScanTest, vector_expression )
+{
+   this->a.setValue( 2 );
+   this->b.setValue( 1 );
+
+   // exclusive scan test
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ] = i;
+
+   this->c.setValue( 0 );
+   distributedExclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->c );
+
+   // inclusive scan test
+   for( int i = this->localRange.getBegin(); i < this->localRange.getEnd(); i++ )
+      this->expected_host[ i ]++;
+
+   this->c.setValue( 0 );
+   distributedInclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->c );
+}
+
+#endif  // HAVE_GTEST
+
+#include "../main_mpi.h"
diff --git a/src/UnitTests/Algorithms/distributedScanTestCuda.cu b/src/UnitTests/Algorithms/distributedScanTestCuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e1b60321b48faa922d09cbd2b6af86f09a6e7c92
--- /dev/null
+++ b/src/UnitTests/Algorithms/distributedScanTestCuda.cu
@@ -0,0 +1 @@
+#include "distributedScanTest.h"
diff --git a/src/UnitTests/Algorithms/reduceTest.cpp b/src/UnitTests/Algorithms/reduceTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4e9927262e3a6896fc4c59cb15b2513aba6ab401
--- /dev/null
+++ b/src/UnitTests/Algorithms/reduceTest.cpp
@@ -0,0 +1 @@
+#include "reduceTest.h"
diff --git a/src/UnitTests/Algorithms/reduceTest.h b/src/UnitTests/Algorithms/reduceTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..c39215f47a64011ced84242fbff54876fe45c9d2
--- /dev/null
+++ b/src/UnitTests/Algorithms/reduceTest.h
@@ -0,0 +1,282 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include <TNL/Arithmetics/Quad.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/reduce.h>
+#include "../CustomScalar.h"
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Arithmetics;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::detail;
+
+// test fixture for typed tests
+template< typename Array >
+class ReduceTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;
+};
+
+// types for which ReduceTest is instantiated
+// TODO: Quad must be fixed
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Sequential, int >,
+   Array< int,            Devices::Sequential, int >,
+   Array< long,           Devices::Sequential, int >,
+   Array< double,         Devices::Sequential, int >,
+   //Array< Quad< float >,  Devices::Sequential, int >,
+   //Array< Quad< double >, Devices::Sequential, int >,
+   Array< CustomScalar< int >, Devices::Sequential, long >,
+   Array< int,            Devices::Sequential, long >,
+   Array< long,           Devices::Sequential, long >,
+   Array< double,         Devices::Sequential, long >,
+   //Array< Quad< float >,  Devices::Sequential, long >,
+   //Array< Quad< double >, Devices::Sequential, long >,
+
+   Array< CustomScalar< int >, Devices::Host, int >,
+   Array< int,            Devices::Host, int >,
+   Array< long,           Devices::Host, int >,
+   Array< double,         Devices::Host, int >,
+   //Array< Quad< float >,  Devices::Host, int >,
+   //Array< Quad< double >, Devices::Host, int >,
+   Array< CustomScalar< int >, Devices::Host, long >,
+   Array< int,            Devices::Host, long >,
+   Array< long,           Devices::Host, long >,
+   Array< double,         Devices::Host, long >
+   //Array< Quad< float >,  Devices::Host, long >,
+   //Array< Quad< double >, Devices::Host, long >
+#endif
+#ifdef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Cuda, int >,  // the reduction kernel for CustomScalar is not specialized with __shfl instructions
+   Array< int,            Devices::Cuda, int >,
+   Array< long,           Devices::Cuda, int >,
+   Array< double,         Devices::Cuda, int >,
+   //Array< Quad< float >,  Devices::Cuda, int >,
+   //Array< Quad< double >, Devices::Cuda, int >,
+   Array< CustomScalar< int >, Devices::Cuda, long >,  // the reduction kernel for CustomScalar is not specialized with __shfl instructions
+   Array< int,            Devices::Cuda, long >,
+   Array< long,           Devices::Cuda, long >,
+   Array< double,         Devices::Cuda, long >
+   //Array< Quad< float >,  Devices::Cuda, long >,
+   //Array< Quad< double >, Devices::Cuda, long >
+#endif
+>;
+
+TYPED_TEST_SUITE( ReduceTest, ArrayTypes );
+
+template< typename Array >
+void iota( Array& array, typename Array::ValueType start = 0 )
+{
+   array.forAllElements( [start] __cuda_callable__
+                         ( typename Array::IndexType idx, typename Array::ValueType& value )
+                         { value = idx + start; }
+                       );
+}
+
+template< typename Array >
+void mod( Array& array, typename Array::IndexType mod = 0 )
+{
+   array.forAllElements( [mod] __cuda_callable__
+                         ( typename Array::IndexType idx, typename Array::ValueType& value )
+                         { value = idx % mod; }
+                       );
+}
+
+TYPED_TEST( ReduceTest, sum )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.setValue( 1 );
+
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Plus{} );
+      EXPECT_EQ( res, size );
+
+      res = reduce( a, TNL::Plus{} );
+      EXPECT_EQ( res, size );
+   }
+
+   const int size = 9377;
+   a.setSize( size );
+   iota( a );
+   auto res = reduce( a, TNL::Plus{} );
+   EXPECT_EQ( res, (size * (size - 1)) / 2 );
+}
+
+TYPED_TEST( ReduceTest, product )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   a.setSize( 10 );
+   a.setValue( 2 );
+
+   int result = 1;
+   for( int size = 0; size < a.getSize(); size++ )
+   {
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Multiplies{} );
+      EXPECT_EQ( res, result );
+      result *= 2;
+   }
+}
+
+TYPED_TEST( ReduceTest, min )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      iota( a, 1 );
+
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Min{} );
+      EXPECT_EQ( res, 1 );
+   }
+}
+
+TYPED_TEST( ReduceTest, max )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      iota( a, 1 );
+
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::Max{} );
+      EXPECT_EQ( res, size );
+   }
+}
+
+TYPED_TEST( ReduceTest, minWithArg )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      iota( a, 1 );
+
+      auto res = reduceWithArgument< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::MinWithArg{} );
+      EXPECT_EQ( res.first, 1 );
+      EXPECT_EQ( res.second, 0 );
+   }
+}
+
+TYPED_TEST( ReduceTest, maxWithArg )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      iota( a, 1 );
+
+      auto res = reduceWithArgument< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::MaxWithArg{} );
+      EXPECT_EQ( res.first, size );
+      EXPECT_EQ( res.second, size - 1 );
+   }
+}
+
+TYPED_TEST( ReduceTest, logicalAnd )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+
+      mod( a, 2 );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalAnd{} );
+      EXPECT_EQ( res, false );
+
+      a.setValue( 1 );
+      res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalAnd{} );
+      EXPECT_EQ( res, true );
+   }
+}
+
+TYPED_TEST( ReduceTest, logicalOr )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+
+      mod( a, 2 );
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalOr{} );
+      EXPECT_EQ( res, true );
+
+      a.setValue( 0 );
+      res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::LogicalOr{} );
+      EXPECT_EQ( res, false );
+   }
+}
+
+// bitwise AND (&) is not defined for floating-point types
+template< typename ArrayType >
+std::enable_if_t< std::is_integral< typename ArrayType::ValueType >::value >
+test_bitAnd( ArrayType& a )
+{
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( typename ArrayType::IndexType idx, typename ArrayType::ValueType& value ) { value = 1 | ( 1 << ( idx % 8 ) ); } );
+
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::BitAnd{} );
+      EXPECT_EQ( res, 1 );
+   }
+}
+
+template< typename ArrayType >
+std::enable_if_t< ! std::is_integral< typename ArrayType::ValueType >::value >
+test_bitAnd( ArrayType& a )
+{
+}
+
+TYPED_TEST( ReduceTest, bitAnd )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   test_bitAnd( a );
+}
+
+// bitwise OR (|) is not defined for floating-point types
+template< typename ArrayType >
+std::enable_if_t< std::is_integral< typename ArrayType::ValueType >::value >
+test_bitOr( ArrayType& a )
+{
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( typename ArrayType::IndexType idx, typename ArrayType::ValueType& value ) { value = 1 << ( idx % 8 );} );
+
+      auto res = reduce< typename ArrayType::DeviceType >( 0, size, a.getConstView(), TNL::BitOr{} );
+      EXPECT_EQ( res, 255 );
+   }
+}
+
+template< typename ArrayType >
+std::enable_if_t< ! std::is_integral< typename ArrayType::ValueType >::value >
+test_bitOr( ArrayType& a )
+{
+}
+
+TYPED_TEST( ReduceTest, bitOr )
+{
+   using ArrayType = typename TestFixture::ArrayType;
+   ArrayType a;
+   test_bitOr( a );
+}
+
+#endif
+
+#include "../main.h"
diff --git a/src/UnitTests/Algorithms/reduceTestCuda.cu b/src/UnitTests/Algorithms/reduceTestCuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4e9927262e3a6896fc4c59cb15b2513aba6ab401
--- /dev/null
+++ b/src/UnitTests/Algorithms/reduceTestCuda.cu
@@ -0,0 +1 @@
+#include "reduceTest.h"
diff --git a/src/UnitTests/Algorithms/scanTest.cpp b/src/UnitTests/Algorithms/scanTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..17cdb1d80b1b7077c952c5ef0b92fcc85f574b31
--- /dev/null
+++ b/src/UnitTests/Algorithms/scanTest.cpp
@@ -0,0 +1 @@
+#include "scanTest.h"
diff --git a/src/UnitTests/Algorithms/scanTest.h b/src/UnitTests/Algorithms/scanTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..9611d7acd66c9b02c86bda2d4f91747a2f9f5687
--- /dev/null
+++ b/src/UnitTests/Algorithms/scanTest.h
@@ -0,0 +1,804 @@
+#pragma once
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include <TNL/Arithmetics/Quad.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Containers/VectorView.h>
+#include <TNL/Algorithms/scan.h>
+#include "../CustomScalar.h"
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Arithmetics;
+using namespace TNL::Algorithms;
+using namespace TNL::Algorithms::detail;
+
+// test fixture for typed tests
+template< typename Array >
+class ScanTest : public ::testing::Test
+{
+protected:
+   using ArrayType = Array;
+   using ValueType = typename ArrayType::ValueType;
+   using DeviceType = typename ArrayType::DeviceType;
+   using IndexType = typename ArrayType::IndexType;
+   using ArrayView = Containers::ArrayView< ValueType, DeviceType, IndexType >;
+   using VectorView = Containers::VectorView< ValueType, DeviceType, IndexType >;
+   using HostArrayType = typename ArrayType::template Self< ValueType, Devices::Sequential >;
+
+   ArrayType a, b, c;
+   ArrayView a_view, b_view, c_view;
+   VectorView av_view, bv_view, cv_view;
+   HostArrayType array_host, input_host, expected_host;
+
+   // should be small enough to have fast tests, but larger than minGPUReductionDataSize
+   // and large enough to require multiple CUDA blocks for reduction
+   // also should be a prime number to cause non-uniform distribution of the work
+   const int size = 9377;
+
+   ScanTest()
+   {
+      resetWorkingArrays();
+      input_host = expected_host = a;
+   }
+
+   void resetWorkingArrays()
+   {
+      a.setSize( size );
+      a.setValue( -1 );
+      c = b = a;
+      a_view.bind( a );
+      b_view.bind( b );
+      c_view.bind( c );
+      av_view.bind( a );
+      bv_view.bind( b );
+      cv_view.bind( c );
+
+      // make sure that we perform tests with multiple CUDA grids
+#ifdef HAVE_CUDA
+      if( std::is_same< DeviceType, Devices::Cuda >::value )
+      {
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
+         CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
+      }
+#endif
+   }
+
+   template< Algorithms::detail::ScanType ScanType >
+   void checkResult( const ArrayType& array )
+   {
+#ifdef HAVE_CUDA
+      // skip the check for too small arrays
+      if( array.getSize() > 256 ) {
+         // we don't care which kernel launcher was actually used
+         const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
+                                           CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
+         EXPECT_GT( gridsCount, 1 );
+      }
+#endif
+
+      array_host = array;
+
+      for( int i = 0; i < array.getSize(); i++ )
+         EXPECT_EQ( array_host[ i ], expected_host[ i ] ) << "arrays differ at index i = " << i;
+   }
+};
+
+// types for which ScanTest is instantiated
+// TODO: Quad must be fixed
+using ArrayTypes = ::testing::Types<
+#ifndef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Sequential, short >,
+   Array< int,            Devices::Sequential, short >,
+   Array< long,           Devices::Sequential, short >,
+   Array< double,         Devices::Sequential, short >,
+   //Array< Quad< float >,  Devices::Sequential, short >,
+   //Array< Quad< double >, Devices::Sequential, short >,
+   Array< CustomScalar< int >, Devices::Sequential, int >,
+   Array< int,            Devices::Sequential, int >,
+   Array< long,           Devices::Sequential, int >,
+   Array< double,         Devices::Sequential, int >,
+   //Array< Quad< float >,  Devices::Sequential, int >,
+   //Array< Quad< double >, Devices::Sequential, int >,
+   Array< CustomScalar< int >, Devices::Sequential, long >,
+   Array< int,            Devices::Sequential, long >,
+   Array< long,           Devices::Sequential, long >,
+   Array< double,         Devices::Sequential, long >,
+   //Array< Quad< float >,  Devices::Sequential, long >,
+   //Array< Quad< double >, Devices::Sequential, long >,
+
+   Array< CustomScalar< int >, Devices::Host, short >,
+   Array< int,            Devices::Host, short >,
+   Array< long,           Devices::Host, short >,
+   Array< double,         Devices::Host, short >,
+   //Array< Quad< float >,  Devices::Host, short >,
+   //Array< Quad< double >, Devices::Host, short >,
+   Array< CustomScalar< int >, Devices::Host, int >,
+   Array< int,            Devices::Host, int >,
+   Array< long,           Devices::Host, int >,
+   Array< double,         Devices::Host, int >,
+   //Array< Quad< float >,  Devices::Host, int >,
+   //Array< Quad< double >, Devices::Host, int >,
+   Array< CustomScalar< int >, Devices::Host, long >,
+   Array< int,            Devices::Host, long >,
+   Array< long,           Devices::Host, long >,
+   Array< double,         Devices::Host, long >
+   //Array< Quad< float >,  Devices::Host, long >,
+   //Array< Quad< double >, Devices::Host, long >
+#endif
+#ifdef HAVE_CUDA
+   Array< CustomScalar< int >, Devices::Cuda, short >,  // the scan kernel for CustomScalar is not specialized with __shfl instructions
+   Array< int,            Devices::Cuda, short >,
+   Array< long,           Devices::Cuda, short >,
+   Array< double,         Devices::Cuda, short >,
+   //Array< Quad< float >,  Devices::Cuda, short >,
+   //Array< Quad< double >, Devices::Cuda, short >,
+   Array< CustomScalar< int >, Devices::Cuda, int >,  // the scan kernel for CustomScalar is not specialized with __shfl instructions
+   Array< int,            Devices::Cuda, int >,
+   Array< long,           Devices::Cuda, int >,
+   Array< double,         Devices::Cuda, int >,
+   //Array< Quad< float >,  Devices::Cuda, int >,
+   //Array< Quad< double >, Devices::Cuda, int >,
+   Array< CustomScalar< int >, Devices::Cuda, long >,  // the scan kernel for CustomScalar is not specialized with __shfl instructions
+   Array< int,            Devices::Cuda, long >,
+   Array< long,           Devices::Cuda, long >,
+   Array< double,         Devices::Cuda, long >
+   //Array< Quad< float >,  Devices::Cuda, long >,
+   //Array< Quad< double >, Devices::Cuda, long >
+#endif
+>;
+
+TYPED_TEST_SUITE( ScanTest, ArrayTypes );
+
+TYPED_TEST( ScanTest, inclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( ScanTest, inplaceInclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( ScanTest, inplaceInclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i + 1;
+
+   // general overload, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, inclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i + 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   inclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( ScanTest, inplaceInclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i + 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   inplaceInclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, exclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( ScanTest, inplaceExclusiveScan_zero_array )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 0 );
+   this->expected_host.setValue( 0 );
+
+   // general overload, array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, exclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   // overload with default reduction operation and default end and outputBegin, array view
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin, end and outputBegin, array
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( ScanTest, inplaceExclusiveScan_constant_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   this->input_host.setValue( 1 );
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i;
+
+   // general overload, array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // general overload, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with TNL functional, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation, array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default end, array view
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   // overload with default reduction operation and default begin and end, array
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, exclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   exclusiveScan( this->a_view, this->b_view, 0, this->size, 0, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+}
+
+TYPED_TEST( ScanTest, inplaceExclusiveScan_linear_sequence )
+{
+   using ValueType = typename TestFixture::ValueType;
+
+   for( int i = 0; i < this->size; i++ ) {
+      this->input_host[ i ] = i;
+      this->expected_host[ i ] = (i * (i - 1)) / 2;
+   }
+
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   this->resetWorkingArrays();
+
+   this->a = this->input_host;
+   inplaceExclusiveScan( this->a_view, 0, this->size, std::plus<>{}, (ValueType) 0 );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+}
+
+
+TYPED_TEST( ScanTest, multiplication )
+{
+   this->input_host.setSize( 10 );
+   this->input_host.setValue( 2 );
+   this->expected_host = this->input_host;
+
+   // exclusive scan test
+   int value = 1;
+   for( int i = 0; i < this->expected_host.getSize(); i++ ) {
+      this->expected_host[ i ] = value;
+      value *= 2;
+   }
+
+   this->a = this->input_host;
+   this->b = this->input_host;
+   exclusiveScan( this->a, this->b, 0, this->a.getSize(), 0, TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   inplaceExclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test
+   for( int i = 0; i < this->expected_host.getSize(); i++ )
+      this->expected_host[ i ] *= 2;
+
+   this->a = this->input_host;
+   this->b = this->input_host;
+   inclusiveScan( this->a, this->b, 0, this->a.getSize(), 0, TNL::Multiplies{} );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   inplaceInclusiveScan( this->a, 0, this->a.getSize(), TNL::Multiplies{} );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, custom_begin_end )
+{
+   using IndexType = typename TestFixture::IndexType;
+
+   const IndexType begin = 42;
+   const IndexType end = this->size - begin;
+
+   // exclusive scan test
+   this->input_host.setValue( 1 );
+   this->expected_host.setValue( 1 );
+   for( int i = begin; i < end; i++ )
+      this->expected_host[ i ] = i - begin;
+
+   this->a = this->input_host;
+   this->b = this->input_host;
+   exclusiveScan( this->a, this->b, begin, end, begin );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   inplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test
+   for( int i = begin; i < end; i++ )
+      this->expected_host[ i ]++;
+
+   this->a = this->input_host;
+   this->b = this->input_host;
+   inclusiveScan( this->a, this->b, begin, end, begin );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   inplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, empty_range )
+{
+   using IndexType = typename TestFixture::IndexType;
+
+   this->input_host.setSize( 42 );
+   this->input_host.setValue( 1 );
+   this->expected_host = this->input_host;
+
+   const IndexType begin = 2;
+   const IndexType end = 1;
+
+   // exclusive scan test
+   this->a = this->input_host;
+   this->b = this->input_host;
+   exclusiveScan( this->a, this->b, begin, end, 0 );
+   this->template checkResult< ScanType::Exclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   inplaceExclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Exclusive >( this->a );
+
+   // inclusive scan test
+   this->a = this->input_host;
+   this->b = this->input_host;
+   inclusiveScan( this->a, this->b, begin, end, 0 );
+   this->template checkResult< ScanType::Inclusive >( this->b );
+   EXPECT_EQ( this->a, this->input_host );
+   inplaceInclusiveScan( this->a, begin, end );
+   this->template checkResult< ScanType::Inclusive >( this->a );
+}
+
+TYPED_TEST( ScanTest, vector_expression )
+{
+   this->a.setValue( 2 );
+   this->b.setValue( 1 );
+
+   // exclusive scan test
+   for( int i = 0; i < this->size; i++ )
+      this->expected_host[ i ] = i;
+
+   this->c.setValue( 0 );
+   exclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Exclusive >( this->c );
+
+   // inclusive scan test
+   for( int i = 0; i < this->expected_host.getSize(); i++ )
+      this->expected_host[ i ]++;
+
+   this->c.setValue( 0 );
+   inclusiveScan( this->av_view - this->bv_view, this->c, 0, this->a.getSize(), 0, TNL::Plus{} );
+   this->template checkResult< ScanType::Inclusive >( this->c );
+}
+
+#endif // HAVE_GTEST
+
+#include "../main.h"
diff --git a/src/UnitTests/Algorithms/scanTestCuda.cu b/src/UnitTests/Algorithms/scanTestCuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..17cdb1d80b1b7077c952c5ef0b92fcc85f574b31
--- /dev/null
+++ b/src/UnitTests/Algorithms/scanTestCuda.cu
@@ -0,0 +1 @@
+#include "scanTest.h"
diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt
index d50b682ba8a2284664c63a0c98767ae3faac44aa..03c521cec93ee4de856b9d7e8cd9ff35b0957f85 100644
--- a/src/UnitTests/CMakeLists.txt
+++ b/src/UnitTests/CMakeLists.txt
@@ -8,6 +8,7 @@ ADD_SUBDIRECTORY( Pointers )
 
 set( CPP_TESTS  AssertTest
                 base64Test
+                CustomScalarTest
                 FileNameTest
                 MathTest
                 ObjectTest
diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h
index 1ed8052eef47445e42fa34135c5a83cc431754ea..78bd388a4515f45040e1e41adef5e95cd06db843 100644
--- a/src/UnitTests/Containers/ArrayTest.h
+++ b/src/UnitTests/Containers/ArrayTest.h
@@ -16,9 +16,6 @@
 #include <TNL/Containers/Array.h>
 #include <TNL/Containers/Vector.h>
 #include <TNL/Pointers/DevicePointer.h>
-#include <TNL/Pointers/SharedPointer.h>
-#include <TNL/Pointers/SmartPointersRegister.h>
-#include <TNL/Algorithms/ParallelFor.h>
 
 #include "gtest/gtest.h"
 
@@ -67,6 +64,23 @@ protected:
 // types for which ArrayTest is instantiated
 using ArrayTypes = ::testing::Types<
 #ifndef HAVE_CUDA
+   // we can't test all types because the argument list would be too long...
+//   Array< int,    Devices::Sequential, short >,
+//   Array< long,   Devices::Sequential, short >,
+//   Array< float,  Devices::Sequential, short >,
+//   Array< double, Devices::Sequential, short >,
+//   Array< MyData, Devices::Sequential, short >,
+//   Array< int,    Devices::Sequential, int >,
+//   Array< long,   Devices::Sequential, int >,
+//   Array< float,  Devices::Sequential, int >,
+//   Array< double, Devices::Sequential, int >,
+//   Array< MyData, Devices::Sequential, int >,
+   Array< int,    Devices::Sequential, long >,
+   Array< long,   Devices::Sequential, long >,
+   Array< float,  Devices::Sequential, long >,
+   Array< double, Devices::Sequential, long >,
+   Array< MyData, Devices::Sequential, long >,
+
    Array< int,    Devices::Host, short >,
    Array< long,   Devices::Host, short >,
    Array< float,  Devices::Host, short >,
@@ -105,6 +119,8 @@ using ArrayTypes = ::testing::Types<
    // (but we can't test all types because the argument list would be too long...)
 #ifndef HAVE_CUDA
    ,
+   Vector< float,  Devices::Sequential, long >,
+   Vector< double, Devices::Sequential, long >,
    Vector< float,  Devices::Host, long >,
    Vector< double, Devices::Host, long >
 #endif
@@ -361,6 +377,19 @@ TYPED_TEST( ArrayTest, reset )
    EXPECT_EQ( u.getData(), nullptr );
 }
 
+template< typename Value, typename Index >
+void testArrayElementwiseAccess( Array< Value, Devices::Sequential, Index >&& u )
+{
+   u.setSize( 10 );
+   for( int i = 0; i < 10; i++ ) {
+      u.setElement( i, i );
+      EXPECT_EQ( u.getData()[ i ], i );
+      EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
+   }
+}
+
 template< typename Value, typename Index >
 void testArrayElementwiseAccess( Array< Value, Devices::Host, Index >&& u )
 {
@@ -370,15 +399,17 @@ void testArrayElementwiseAccess( Array< Value, Devices::Host, Index >&& u )
       EXPECT_EQ( u.getData()[ i ], i );
       EXPECT_EQ( u.getElement( i ), i );
       EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
    }
 }
 
 #ifdef HAVE_CUDA
 template< typename ValueType, typename IndexType >
-__global__ void testSetGetElementKernel( Array< ValueType, Devices::Cuda, IndexType >* u )
+__global__ void testSetGetElementKernel( Array< ValueType, Devices::Cuda, IndexType >* u,
+                                         Array< ValueType, Devices::Cuda, IndexType >* v )
 {
-   if( threadIdx.x < ( *u ).getSize() )
-      ( *u )[ threadIdx.x ] = threadIdx.x;
+   if( threadIdx.x < u->getSize() )
+      ( *u )[ threadIdx.x ] = ( *v )( threadIdx.x ) = threadIdx.x;
 }
 #endif /* HAVE_CUDA */
 
@@ -386,14 +417,16 @@ template< typename Value, typename Index >
 void testArrayElementwiseAccess( Array< Value, Devices::Cuda, Index >&& u )
 {
 #ifdef HAVE_CUDA
-   u.setSize( 10 );
    using ArrayType = Array< Value, Devices::Cuda, Index >;
-   Pointers::DevicePointer< ArrayType > kernel_u( u );
-   testSetGetElementKernel<<< 1, 16 >>>( &kernel_u.template modifyData< Devices::Cuda >() );
+   u.setSize( 10 );
+   ArrayType v( 10 );
+   Pointers::DevicePointer< ArrayType > kernel_u( u ), kernel_v( v );
+   testSetGetElementKernel<<< 1, 16 >>>( &kernel_u.template modifyData< Devices::Cuda >(), &kernel_v.template modifyData< Devices::Cuda >() );
    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    for( int i = 0; i < 10; i++ ) {
       EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( v.getElement( i ), i );
    }
 #endif
 }
@@ -405,22 +438,46 @@ TYPED_TEST( ArrayTest, elementwiseAccess )
    testArrayElementwiseAccess( ArrayType() );
 }
 
-template< typename ArrayType >
-void test_setElement()
+template< typename Value, typename Index >
+void test_setElement_on_device( const Array< Value, Devices::Sequential, Index >& )
 {
-   Pointers::SharedPointer< ArrayType > a( 10, 0 ), b( 10, 0 );
-   auto set = [=] __cuda_callable__ ( int i ) mutable {
-      a->setElement( i, i );
-      b->setElement( i, a->getElement( i ) );
-   };
-   Pointers::synchronizeSmartPointersOnDevice< typename ArrayType::DeviceType >();
-   Algorithms::ParallelFor< typename ArrayType::DeviceType >::exec( 0, 10, set );
-   for( int i = 0; i < 10; i++ )
-   {
-      EXPECT_EQ( a->getElement( i ), i );
-      EXPECT_EQ( b->getElement( i ), i );
+}
+
+template< typename Value, typename Index >
+void test_setElement_on_device( const Array< Value, Devices::Host, Index >& )
+{
+}
+
+#ifdef HAVE_CUDA
+template< typename ValueType, typename IndexType >
+__global__ void test_setElement_on_device_kernel( Array< ValueType, Devices::Cuda, IndexType >* a,
+                                                  Array< ValueType, Devices::Cuda, IndexType >* b )
+{
+   if( threadIdx.x < a->getSize() ) {
+      a->setElement( threadIdx.x, threadIdx.x );
+      b->setElement( threadIdx.x, a->getElement( threadIdx.x ) );
    }
 }
+#endif /* HAVE_CUDA */
+
+template< typename Value, typename Index >
+void test_setElement_on_device( const Array< Value, Devices::Cuda, Index >& )
+{
+#ifdef HAVE_CUDA
+   using ArrayType = Array< Value, Devices::Cuda, Index >;
+   ArrayType a( 10, 0 ), b( 10, 0 );
+   Pointers::DevicePointer< ArrayType > kernel_a( a );
+   Pointers::DevicePointer< ArrayType > kernel_b( b );
+   test_setElement_on_device_kernel<<< 1, 16 >>>( &kernel_a.template modifyData< Devices::Cuda >(),
+                                                  &kernel_b.template modifyData< Devices::Cuda >() );
+   cudaDeviceSynchronize();
+   TNL_CHECK_CUDA_DEVICE;
+   for( int i = 0; i < 10; i++ ) {
+      EXPECT_EQ( a.getElement( i ), i );
+      EXPECT_EQ( b.getElement( i ), i );
+   }
+#endif
+}
 
 TYPED_TEST( ArrayTest, setElement )
 {
@@ -433,7 +490,7 @@ TYPED_TEST( ArrayTest, setElement )
    for( int i = 0; i < 10; i++ )
       EXPECT_EQ( a.getElement( i ), i );
 
-   test_setElement< ArrayType >();
+   test_setElement_on_device( a );
 }
 
 // test must be in a plain function because nvcc sucks (extended lambdas are
@@ -455,40 +512,6 @@ TYPED_TEST( ArrayTest, forElements )
    testArrayForEachElement< typename TestFixture::ArrayType >();
 }
 
-TYPED_TEST( ArrayTest, containsValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-
-   ArrayType array;
-   array.setSize( 1024 );
-
-   for( int i = 0; i < array.getSize(); i++ )
-      array.setElement( i, i % 10 );
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( array.containsValue( i ) );
-
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( array.containsValue( i ) );
-}
-
-TYPED_TEST( ArrayTest, containsOnlyValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-
-   ArrayType array;
-   array.setSize( 1024 );
-
-   for( int i = 0; i < array.getSize(); i++ )
-      array.setElement( i, i % 10 );
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( array.containsOnlyValue( i ) );
-
-   array.setValue( 100 );
-   EXPECT_TRUE( array.containsOnlyValue( 100 ) );
-}
-
 TYPED_TEST( ArrayTest, comparisonOperator )
 {
    using ArrayType = typename TestFixture::ArrayType;
diff --git a/src/UnitTests/Containers/ArrayViewTest.h b/src/UnitTests/Containers/ArrayViewTest.h
index d620b8bbb1ced1446f05e99fd1992c9609eb777b..93c593bd39dac0f4731ef902bfe119e8fb42d07c 100644
--- a/src/UnitTests/Containers/ArrayViewTest.h
+++ b/src/UnitTests/Containers/ArrayViewTest.h
@@ -60,7 +60,24 @@ protected:
 // types for which ArrayViewTest is instantiated
 using ViewTypes = ::testing::Types<
 #ifndef HAVE_CUDA
-    ArrayView< int,    Devices::Host, short >
+   // we can't test all types because the argument list would be too long...
+//    ArrayView< int,    Devices::Sequential, short >
+//   ,ArrayView< long,   Devices::Sequential, short >
+//   ,ArrayView< float,  Devices::Sequential, short >
+//   ,ArrayView< double, Devices::Sequential, short >
+//   ,ArrayView< MyData, Devices::Sequential, short >
+//   ,ArrayView< int,    Devices::Sequential, int >
+//   ,ArrayView< long,   Devices::Sequential, int >
+//   ,ArrayView< float,  Devices::Sequential, int >
+//   ,ArrayView< double, Devices::Sequential, int >
+//   ,ArrayView< MyData, Devices::Sequential, int >
+    ArrayView< int,    Devices::Sequential, long >
+   ,ArrayView< long,   Devices::Sequential, long >
+   ,ArrayView< float,  Devices::Sequential, long >
+   ,ArrayView< double, Devices::Sequential, long >
+   ,ArrayView< MyData, Devices::Sequential, long >
+
+   ,ArrayView< int,    Devices::Host, short >
    ,ArrayView< long,   Devices::Host, short >
    ,ArrayView< float,  Devices::Host, short >
    ,ArrayView< double, Devices::Host, short >
@@ -98,6 +115,8 @@ using ViewTypes = ::testing::Types<
    // (but we can't test all types because the argument list would be too long...)
 #ifndef HAVE_CUDA
    ,
+   VectorView< float,  Devices::Sequential, long >,
+   VectorView< double, Devices::Sequential, long >,
    VectorView< float,  Devices::Host, long >,
    VectorView< double, Devices::Host, long >
 #endif
@@ -218,6 +237,21 @@ TYPED_TEST( ArrayViewTest, reset )
    EXPECT_EQ( u.getData(), nullptr );
 }
 
+template< typename Value, typename Index >
+void testArrayViewElementwiseAccess( Array< Value, Devices::Sequential, Index >&& a )
+{
+   a.setSize( 10 );
+   using ViewType = ArrayView< Value, Devices::Sequential, Index >;
+   ViewType u( a );
+   for( int i = 0; i < 10; i++ ) {
+      u.setElement( i, i );
+      EXPECT_EQ( u.getData()[ i ], i );
+      EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
+   }
+}
+
 template< typename Value, typename Index >
 void testArrayViewElementwiseAccess( Array< Value, Devices::Host, Index >&& a )
 {
@@ -229,30 +263,34 @@ void testArrayViewElementwiseAccess( Array< Value, Devices::Host, Index >&& a )
       EXPECT_EQ( u.getData()[ i ], i );
       EXPECT_EQ( u.getElement( i ), i );
       EXPECT_EQ( u[ i ], i );
+      EXPECT_EQ( u( i ), i );
    }
 }
 
 #ifdef HAVE_CUDA
 template< typename ValueType, typename IndexType >
-__global__ void testSetGetElementKernel( ArrayView< ValueType, Devices::Cuda, IndexType > v )
+__global__ void testSetGetElementKernel( ArrayView< ValueType, Devices::Cuda, IndexType > u,
+                                         ArrayView< ValueType, Devices::Cuda, IndexType > v )
 {
    if( threadIdx.x < v.getSize() )
-      v[ threadIdx.x ] = threadIdx.x;
+      u[ threadIdx.x ] = v( threadIdx.x ) = threadIdx.x;
 }
 #endif // HAVE_CUDA
 
 template< typename Value, typename Index >
-void testArrayViewElementwiseAccess( Array< Value, Devices::Cuda, Index >&& u )
+void testArrayViewElementwiseAccess( Array< Value, Devices::Cuda, Index >&& a )
 {
 #ifdef HAVE_CUDA
-   u.setSize( 10 );
    using ArrayType = Array< Value, Devices::Cuda, Index >;
    using ViewType = ArrayView< Value, Devices::Cuda, Index >;
-   ViewType v( u );
-   testSetGetElementKernel<<< 1, 16 >>>( v );
+   a.setSize( 10 );
+   ArrayType b( 10 );
+   ViewType u( a ), v( b );
+   testSetGetElementKernel<<< 1, 16 >>>( u, v );
    TNL_CHECK_CUDA_DEVICE;
    for( int i = 0; i < 10; i++ ) {
-      EXPECT_EQ( u.getElement( i ), i );
+      EXPECT_EQ( a.getElement( i ), i );
+      EXPECT_EQ( b.getElement( i ), i );
    }
 #endif
 }
@@ -274,7 +312,7 @@ void ArrayViewEvaluateTest( ArrayType& u )
    ViewType v( u );
 
    v.forAllElements( [] __cuda_callable__ ( IndexType i, ValueType& value ) { value = 3 * i % 4; } );
-   
+
    for( int i = 0; i < 10; i++ )
    {
       EXPECT_EQ( u.getElement( i ), 3 * i % 4 );
@@ -323,44 +361,6 @@ TYPED_TEST( ArrayViewTest, evaluate )
    ArrayViewEvaluateTest( u );
 }
 
-TYPED_TEST( ArrayViewTest, containsValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-   using ViewType = typename TestFixture::ViewType;
-
-   ArrayType a;
-   a.setSize( 1024 );
-   ViewType v = a.getView();
-
-   for( int i = 0; i < v.getSize(); i++ )
-      v.setElement( i, i % 10 );
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( v.containsValue( i ) );
-
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( v.containsValue( i ) );
-}
-
-TYPED_TEST( ArrayViewTest, containsOnlyValue )
-{
-   using ArrayType = typename TestFixture::ArrayType;
-   using ViewType = typename TestFixture::ViewType;
-
-   ArrayType a;
-   a.setSize( 1024 );
-   ViewType v = a.getView();
-
-   for( int i = 0; i < v.getSize(); i++ )
-      v.setElement( i, i % 10 );
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( v.containsOnlyValue( i ) );
-
-   a.setValue( 100 );
-   EXPECT_TRUE( v.containsOnlyValue( 100 ) );
-}
-
 TYPED_TEST( ArrayViewTest, comparisonOperator )
 {
    using ArrayType = typename TestFixture::ArrayType;
diff --git a/src/UnitTests/Containers/CMakeLists.txt b/src/UnitTests/Containers/CMakeLists.txt
index 9d9e413432d33aad501c184bbf24bfae2bc41da6..4c39452022ebe26856b70e25e088ea0207cabb6d 100644
--- a/src/UnitTests/Containers/CMakeLists.txt
+++ b/src/UnitTests/Containers/CMakeLists.txt
@@ -8,7 +8,6 @@ set( CPP_TESTS
          StaticVectorOperationsTest
          StaticVectorOfStaticVectorsTest
          VectorTest
-         VectorPrefixSumTest
          VectorEvaluateAndReduceTest
          VectorBinaryOperationsTest
          VectorUnaryOperationsTest
@@ -19,7 +18,6 @@ set( CUDA_TESTS
          ArrayTestCuda
          ArrayViewTestCuda
          VectorTestCuda
-         VectorPrefixSumTestCuda
          VectorEvaluateAndReduceTestCuda
          VectorBinaryOperationsTestCuda
          VectorUnaryOperationsTestCuda
@@ -56,16 +54,6 @@ if( ${BUILD_MPI} )
       TARGET_LINK_LIBRARIES( DistributedArrayTest ${GTEST_BOTH_LIBRARIES} )
    endif()
 
-   ADD_EXECUTABLE( DistributedVectorTest DistributedVectorTest.cpp )
-   TARGET_COMPILE_OPTIONS( DistributedVectorTest PRIVATE ${CXX_TESTS_FLAGS} )
-   TARGET_LINK_LIBRARIES( DistributedVectorTest ${GTEST_BOTH_LIBRARIES} )
-
-   if( BUILD_CUDA )
-      CUDA_ADD_EXECUTABLE( DistributedVectorTestCuda DistributedVectorTestCuda.cu
-                           OPTIONS ${CXX_TESTS_FLAGS} )
-      TARGET_LINK_LIBRARIES( DistributedVectorTestCuda ${GTEST_BOTH_LIBRARIES} )
-   endif()
-
    ADD_EXECUTABLE( DistributedVectorBinaryOperationsTest DistributedVectorBinaryOperationsTest.cpp )
    TARGET_COMPILE_OPTIONS( DistributedVectorBinaryOperationsTest PRIVATE ${CXX_TESTS_FLAGS} )
    TARGET_LINK_LIBRARIES( DistributedVectorBinaryOperationsTest ${GTEST_BOTH_LIBRARIES} )
@@ -93,10 +81,6 @@ if( ${BUILD_MPI} )
    ADD_TEST( NAME DistributedArrayTest COMMAND "mpirun" ${mpi_test_parameters})
    ADD_TEST( NAME DistributedArrayTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
-   SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
-   ADD_TEST( NAME DistributedVectorTest COMMAND "mpirun" ${mpi_test_parameters})
-   ADD_TEST( NAME DistributedVectorTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTest${CMAKE_EXECUTABLE_SUFFIX}" )
-
    SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
    ADD_TEST( NAME DistributedVectorBinaryOperationsTest COMMAND "mpirun" ${mpi_test_parameters})
    ADD_TEST( NAME DistributedVectorBinaryOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
@@ -110,10 +94,6 @@ if( ${BUILD_MPI} )
    ADD_TEST( NAME DistributedVectorVerticalOperationsTest_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorVerticalOperationsTest${CMAKE_EXECUTABLE_SUFFIX}" )
 
    if( BUILD_CUDA )
-      SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
-      ADD_TEST( NAME DistributedVectorTestCuda COMMAND "mpirun" ${mpi_test_parameters})
-      ADD_TEST( NAME DistributedVectorTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
-
       SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
       ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda COMMAND "mpirun" ${mpi_test_parameters})
       ADD_TEST( NAME DistributedVectorBinaryOperationsTestCuda_nodistr COMMAND "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedVectorBinaryOperationsTestCuda${CMAKE_EXECUTABLE_SUFFIX}" )
diff --git a/src/UnitTests/Containers/DistributedArrayTest.h b/src/UnitTests/Containers/DistributedArrayTest.h
index e25739afe1b8fb6608e5d319953f4d072f77f875..bc0edb4453ff0519205854a3af01319252d4668d 100644
--- a/src/UnitTests/Containers/DistributedArrayTest.h
+++ b/src/UnitTests/Containers/DistributedArrayTest.h
@@ -291,43 +291,6 @@ TYPED_TEST( DistributedArrayTest, comparisonOperators )
    EXPECT_TRUE( u == v );
 }
 
-TYPED_TEST( DistributedArrayTest, containsValue )
-{
-   using IndexType = typename TestFixture::IndexType;
-
-   const auto localRange = this->distributedArray.getLocalRange();
-
-   for( int i = 0; i < localRange.getSize(); i++ ) {
-      const IndexType gi = localRange.getGlobalIndex( i );
-      this->distributedArray.setElement( gi, i % 10 );
-   }
-
-   for( int i = 0; i < 10; i++ )
-      EXPECT_TRUE( this->distributedArray.containsValue( i ) );
-
-   for( int i = 10; i < 20; i++ )
-      EXPECT_FALSE( this->distributedArray.containsValue( i ) );
-}
-
-TYPED_TEST( DistributedArrayTest, containsOnlyValue )
-{
-   using IndexType = typename TestFixture::IndexType;
-
-   const auto localRange = this->distributedArray.getLocalRange();
-
-   for( int i = 0; i < localRange.getSize(); i++ ) {
-      const IndexType gi = localRange.getGlobalIndex( i );
-      this->distributedArray.setElement( gi, i % 10 );
-   }
-
-   for( int i = 0; i < 20; i++ )
-      EXPECT_FALSE( this->distributedArray.containsOnlyValue( i ) );
-
-   this->distributedArray.setValue( 100 );
-   this->distributedArray.waitForSynchronization();
-   EXPECT_TRUE( this->distributedArray.containsOnlyValue( 100 ) );
-}
-
 TYPED_TEST( DistributedArrayTest, empty )
 {
    EXPECT_GT( this->distributedArray.getSize(), 0 );
diff --git a/src/UnitTests/Containers/DistributedVectorTest.cpp b/src/UnitTests/Containers/DistributedVectorTest.cpp
deleted file mode 100644
index 5b0c61c85fc3ad450d3b838b53156da90c38e7d6..0000000000000000000000000000000000000000
--- a/src/UnitTests/Containers/DistributedVectorTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "DistributedVectorTest.h"
diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h
deleted file mode 100644
index 8dc9d6d26c2553039ba34474d4adb6a990cc8bd5..0000000000000000000000000000000000000000
--- a/src/UnitTests/Containers/DistributedVectorTest.h
+++ /dev/null
@@ -1,331 +0,0 @@
-/***************************************************************************
-                          DistributedVectorTest.h  -  description
-                             -------------------
-    begin                : Sep 6, 2018
-    copyright            : (C) 2018 by Tomas Oberhuber et al.
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-#ifdef HAVE_GTEST
-#include <limits>
-
-#include <gtest/gtest.h>
-
-#include <TNL/Containers/DistributedVector.h>
-#include <TNL/Containers/DistributedVectorView.h>
-#include <TNL/Containers/Partitioner.h>
-
-#define DISTRIBUTED_VECTOR
-#include "VectorHelperFunctions.h"
-
-using namespace TNL;
-using namespace TNL::Containers;
-using namespace TNL::MPI;
-
-/*
- * Light check of DistributedVector.
- *
- * - Number of processes is not limited.
- * - Global size is hardcoded as 97 to force non-uniform distribution.
- * - Communication group is hardcoded as AllGroup -- it may be changed as needed.
- */
-template< typename DistributedVector >
-class DistributedVectorTest
-: public ::testing::Test
-{
-protected:
-   using RealType = typename DistributedVector::RealType;
-   using DeviceType = typename DistributedVector::DeviceType;
-   using IndexType = typename DistributedVector::IndexType;
-   using DistributedVectorType = DistributedVector;
-   using VectorViewType = typename DistributedVectorType::LocalViewType;
-   using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType >;
-   using HostDistributedVectorType = typename DistributedVectorType::template Self< RealType, Devices::Sequential >;
-
-   const MPI_Comm group = AllGroup();
-
-   DistributedVectorType v;
-   DistributedVectorView v_view;
-   HostDistributedVectorType v_host;
-
-   const int rank = GetRank(group);
-   const int nproc = GetSize(group);
-
-   // should be small enough to have fast tests, but large enough to test
-   // scan with multiple CUDA grids
-   const int globalSize = 10000 * nproc;
-
-   // some arbitrary value (but must be 0 if not distributed)
-   const int ghosts = (nproc > 1) ? 4 : 0;
-
-   DistributedVectorTest()
-   {
-      using LocalRangeType = typename DistributedVector::LocalRangeType;
-      const LocalRangeType localRange = Partitioner< IndexType >::splitRange( globalSize, group );
-      v.setDistribution( localRange, ghosts, globalSize, group );
-
-      using Synchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< DeviceType >;
-      using HostSynchronizer = typename Partitioner< IndexType >::template ArraySynchronizer< Devices::Sequential >;
-      v.setSynchronizer( std::make_shared<Synchronizer>( localRange, ghosts / 2, group ) );
-      v_view.setSynchronizer( v.getSynchronizer() );
-      v_host.setSynchronizer( std::make_shared<HostSynchronizer>( localRange, ghosts / 2, group ) );
-
-      v_view.bind( v );
-      setConstantSequence( v, 1 );
-   }
-};
-
-// types for which DistributedVectorTest is instantiated
-using DistributedVectorTypes = ::testing::Types<
-   DistributedVector< double, Devices::Host, int >
-#ifdef HAVE_CUDA
-   ,
-   DistributedVector< double, Devices::Cuda, int >
-#endif
->;
-
-TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes );
-
-// TODO: test that horizontal operations are computed for ghost values without synchronization
-
-TYPED_TEST( DistributedVectorTest, scan )
-{
-   using RealType = typename TestFixture::DistributedVectorType::RealType;
-   using DeviceType = typename TestFixture::DistributedVectorType::DeviceType;
-   using IndexType = typename TestFixture::DistributedVectorType::IndexType;
-
-   auto& v = this->v;
-   auto& v_view = this->v_view;
-   auto& v_host = this->v_host;
-   const auto localRange = v.getLocalRange();
-
-   // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
-      return;
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v.scan();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v.scan();
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v.scan();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.scan();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.scan();
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v_view.scan();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 );
-
-      setLinearSequence( v );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 );
-
-      setLinearSequence( v );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
-#endif
-   }
-}
-
-TYPED_TEST( DistributedVectorTest, exclusiveScan )
-{
-   using RealType = typename TestFixture::DistributedVectorType::RealType;
-   using DeviceType = typename TestFixture::DistributedVectorType::DeviceType;
-   using IndexType = typename TestFixture::DistributedVectorType::IndexType;
-
-   auto& v = this->v;
-   auto& v_view = this->v_view;
-   auto& v_host = this->v_host;
-   const auto localRange = v.getLocalRange();
-
-   // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
-      return;
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v_view;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i );
-
-      setLinearSequence( v );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], 0 );
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], i );
-
-      setLinearSequence( v );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
-#endif
-   }
-}
-
-#endif  // HAVE_GTEST
-
-#include "../main_mpi.h"
diff --git a/src/UnitTests/Containers/DistributedVectorTestCuda.cu b/src/UnitTests/Containers/DistributedVectorTestCuda.cu
deleted file mode 100644
index 5b0c61c85fc3ad450d3b838b53156da90c38e7d6..0000000000000000000000000000000000000000
--- a/src/UnitTests/Containers/DistributedVectorTestCuda.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "DistributedVectorTest.h"
diff --git a/src/UnitTests/Containers/StaticArrayTest.cpp b/src/UnitTests/Containers/StaticArrayTest.cpp
index b22afa798d1280905934485109a890844e5445fe..e491b2021566d4b8f61c2d5bc70c75fe61aea7ab 100644
--- a/src/UnitTests/Containers/StaticArrayTest.cpp
+++ b/src/UnitTests/Containers/StaticArrayTest.cpp
@@ -117,6 +117,8 @@ void checkCoordinates( StaticArray< 1, Value >& u )
    EXPECT_EQ( u.x(), 0 );
    u.x() += 1;
    EXPECT_EQ( u.x(), 1 );
+   EXPECT_EQ( u[ 0 ], 1 );
+   EXPECT_EQ( u( 0 ), 1 );
 }
 
 template< typename Value >
@@ -127,7 +129,11 @@ void checkCoordinates( StaticArray< 2, Value >& u )
    u.x() += 1;
    u.y() += 1;
    EXPECT_EQ( u.x(), 1 );
+   EXPECT_EQ( u[ 0 ], 1 );
+   EXPECT_EQ( u( 0 ), 1 );
    EXPECT_EQ( u.y(), 2 );
+   EXPECT_EQ( u[ 1 ], 2 );
+   EXPECT_EQ( u( 1 ), 2 );
 }
 
 template< typename Value >
@@ -140,8 +146,14 @@ void checkCoordinates( StaticArray< 3, Value >& u )
    u.y() += 1;
    u.z() += 1;
    EXPECT_EQ( u.x(), 1 );
+   EXPECT_EQ( u[ 0 ], 1 );
+   EXPECT_EQ( u( 0 ), 1 );
    EXPECT_EQ( u.y(), 2 );
+   EXPECT_EQ( u[ 1 ], 2 );
+   EXPECT_EQ( u( 1 ), 2 );
    EXPECT_EQ( u.z(), 3 );
+   EXPECT_EQ( u[ 2 ], 3 );
+   EXPECT_EQ( u( 2 ), 3 );
 }
 
 template< int _size, typename Value >
diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
index b79b675cf7237950be48d68992f7bcd8c794b01d..341418f85496f861845f80eee3f393baa1c1cdf9 100644
--- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h
@@ -28,6 +28,7 @@
 #endif
 
 #include "VectorHelperFunctions.h"
+#include "../CustomScalar.h"
 
 #include "gtest/gtest.h"
 
@@ -163,8 +164,8 @@ protected:
             DistributedVectorView< short, Devices::Host, int > >,
       Pair< DistributedVectorView< int,   Devices::Host, int >,
             DistributedVector<     short, Devices::Host, int > >,
-      Pair< DistributedVectorView< int,   Devices::Host, int >,
-            DistributedVectorView< short, Devices::Host, int > >
+      Pair< DistributedVectorView< CustomScalar< int >,   Devices::Host, int >,
+            DistributedVectorView< CustomScalar< short >, Devices::Host, int > >
    #else
       Pair< DistributedVector<     int,   Devices::Cuda, int >,
             DistributedVector<     short, Devices::Cuda, int > >,
@@ -172,8 +173,8 @@ protected:
             DistributedVectorView< short, Devices::Cuda, int > >,
       Pair< DistributedVectorView< int,   Devices::Cuda, int >,
             DistributedVector<     short, Devices::Cuda, int > >,
-      Pair< DistributedVectorView< int,   Devices::Cuda, int >,
-            DistributedVectorView< short, Devices::Cuda, int > >
+      Pair< DistributedVectorView< CustomScalar< int >,   Devices::Cuda, int >,
+            DistributedVectorView< CustomScalar< short >, Devices::Cuda, int > >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -183,20 +184,21 @@ protected:
          Pair< StaticVector< 2, StaticVector< 3, int > >,  StaticVector< 2, StaticVector< 3, short > > >,
          Pair< StaticVector< 3, StaticVector< 3, int > >,  StaticVector< 3, StaticVector< 3, short > > >,
          Pair< StaticVector< 4, StaticVector< 3, int > >,  StaticVector< 4, StaticVector< 3, short > > >,
-         Pair< StaticVector< 5, StaticVector< 3, int > >,  StaticVector< 5, StaticVector< 3, short > > >
+         Pair< StaticVector< 5, StaticVector< 3, int > >,  StaticVector< 5, StaticVector< 3, short > > >,
+         Pair< StaticVector< 5, StaticVector< 3, CustomScalar< int > > >,  StaticVector< 5, StaticVector< 3, CustomScalar< short > > > >
       >;
    #else
       using VectorPairs = ::testing::Types<
-         Pair< StaticVector< 1, int >,     StaticVector< 1, short >    >,
+         Pair< StaticVector< 1, int    >,  StaticVector< 1, short  > >,
          Pair< StaticVector< 1, double >,  StaticVector< 1, double > >,
-         Pair< StaticVector< 2, int >,     StaticVector< 2, short >    >,
+         Pair< StaticVector< 2, int    >,  StaticVector< 2, short  > >,
          Pair< StaticVector< 2, double >,  StaticVector< 2, double > >,
-         Pair< StaticVector< 3, int >,     StaticVector< 3, short >    >,
+         Pair< StaticVector< 3, int    >,  StaticVector< 3, short  > >,
          Pair< StaticVector< 3, double >,  StaticVector< 3, double > >,
-         Pair< StaticVector< 4, int >,     StaticVector< 4, short >    >,
+         Pair< StaticVector< 4, int    >,  StaticVector< 4, short  > >,
          Pair< StaticVector< 4, double >,  StaticVector< 4, double > >,
-         Pair< StaticVector< 5, int >,     StaticVector< 5, short >    >,
-         Pair< StaticVector< 5, double >,  StaticVector< 5, double > >
+         Pair< StaticVector< 5, int    >,  StaticVector< 5, CustomScalar< short > > >,
+         Pair< StaticVector< 5, double >,  StaticVector< 5, CustomScalar< double > > >
       >;
    #endif
 #else
@@ -217,33 +219,25 @@ protected:
    #else
       using VectorPairs = ::testing::Types<
       #ifndef HAVE_CUDA
-         Pair< Vector<     int,       Devices::Host >, Vector<     int,       Devices::Host > >,
-         Pair< VectorView< int,       Devices::Host >, Vector<     int,       Devices::Host > >,
-         Pair< VectorView< const int, Devices::Host >, Vector<     int,       Devices::Host > >,
-         Pair< Vector<     int,       Devices::Host >, VectorView< int,       Devices::Host > >,
-         Pair< Vector<     int,       Devices::Host >, VectorView< const int, Devices::Host > >,
-         Pair< VectorView< int,       Devices::Host >, VectorView< int,       Devices::Host > >,
-         Pair< VectorView< const int, Devices::Host >, VectorView< int,       Devices::Host > >,
-         Pair< VectorView< const int, Devices::Host >, VectorView< const int, Devices::Host > >,
-         Pair< VectorView< int,       Devices::Host >, VectorView< const int, Devices::Host > >,
-         Pair< Vector<     double,    Devices::Host >, Vector<     double,    Devices::Host > >,
-         Pair< VectorView< double,    Devices::Host >, Vector<     double,    Devices::Host > >,
-         Pair< Vector<     double,    Devices::Host >, VectorView< double,    Devices::Host > >,
-         Pair< VectorView< double,    Devices::Host >, VectorView< double,    Devices::Host > >
+         Pair< Vector<     int,                 Devices::Host >, Vector<     int,                          Devices::Host > >,
+         Pair< VectorView< int,                 Devices::Host >, Vector<     int,                          Devices::Host > >,
+         Pair< VectorView< const int,           Devices::Host >, Vector<     int,                          Devices::Host > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Host >, VectorView< CustomScalar< double >,       Devices::Host > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Host >, VectorView< const CustomScalar< double >, Devices::Host > >,
+         Pair< VectorView< CustomScalar< int >, Devices::Host >, VectorView< CustomScalar< double >,       Devices::Host > >,
+         Pair< VectorView< const int,           Devices::Host >, VectorView< int,                          Devices::Host > >,
+         Pair< VectorView< const int,           Devices::Host >, VectorView< const int,                    Devices::Host > >,
+         Pair< VectorView< int,                 Devices::Host >, VectorView< const int,                    Devices::Host > >
       #else
-         Pair< Vector<     int,       Devices::Cuda >, Vector<     int,       Devices::Cuda > >,
-         Pair< VectorView< int,       Devices::Cuda >, Vector<     int,       Devices::Cuda > >,
-         Pair< VectorView< const int, Devices::Cuda >, Vector<     int,       Devices::Cuda > >,
-         Pair< Vector<     int,       Devices::Cuda >, VectorView< int,       Devices::Cuda > >,
-         Pair< Vector<     int,       Devices::Cuda >, VectorView< const int, Devices::Cuda > >,
-         Pair< VectorView< int,       Devices::Cuda >, VectorView< int,       Devices::Cuda > >,
-         Pair< VectorView< const int, Devices::Cuda >, VectorView< int,       Devices::Cuda > >,
-         Pair< VectorView< const int, Devices::Cuda >, VectorView< const int, Devices::Cuda > >,
-         Pair< VectorView< int,       Devices::Cuda >, VectorView< const int, Devices::Cuda > >,
-         Pair< Vector<     double,    Devices::Cuda >, Vector<     double,    Devices::Cuda > >,
-         Pair< VectorView< double,    Devices::Cuda >, Vector<     double,    Devices::Cuda > >,
-         Pair< Vector<     double,    Devices::Cuda >, VectorView< double,    Devices::Cuda > >,
-         Pair< VectorView< double,    Devices::Cuda >, VectorView< double,    Devices::Cuda > >
+         Pair< Vector<     int,                 Devices::Cuda >, Vector<     int,                          Devices::Cuda > >,
+         Pair< VectorView< int,                 Devices::Cuda >, Vector<     int,                          Devices::Cuda > >,
+         Pair< VectorView< const int,           Devices::Cuda >, Vector<     int,                          Devices::Cuda > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Cuda >, VectorView< CustomScalar< double >,       Devices::Cuda > >,
+         Pair< Vector<     CustomScalar< int >, Devices::Cuda >, VectorView< const CustomScalar< double >, Devices::Cuda > >,
+         Pair< VectorView< CustomScalar< int >, Devices::Cuda >, VectorView< CustomScalar< double >,       Devices::Cuda > >,
+         Pair< VectorView< const int,           Devices::Cuda >, VectorView< int,                          Devices::Cuda > >,
+         Pair< VectorView< const int,           Devices::Cuda >, VectorView< const int,                    Devices::Cuda > >,
+         Pair< VectorView< int,                 Devices::Cuda >, VectorView< const int,                    Devices::Cuda > >
       #endif
       >;
    #endif
diff --git a/src/UnitTests/Containers/VectorHelperFunctions.h b/src/UnitTests/Containers/VectorHelperFunctions.h
index 32f2d52ba7d8cdba98eb671e57c5a7ae1de64583..1915f535da232c4ad2042a1bc229ee7cfac9b0e6 100644
--- a/src/UnitTests/Containers/VectorHelperFunctions.h
+++ b/src/UnitTests/Containers/VectorHelperFunctions.h
@@ -28,17 +28,16 @@ void setLinearSequence( Vector& deviceVector )
    deviceVector = a;
 }
 
-template< typename Vector >
-void setConstantSequence( Vector& deviceVector,
-                          typename Vector::RealType v )
+template< typename Vector, typename Value >
+void setConstantSequence( Vector& deviceVector, Value v )
 {
-   deviceVector.setValue( v );
+   deviceVector.setValue( typename Vector::ValueType( v ) );
 }
 
 template< typename Vector >
 void setOscilatingLinearSequence( Vector& deviceVector )
 {
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
    for( int i = 0; i < a.getSize(); i++ )
@@ -47,10 +46,9 @@ void setOscilatingLinearSequence( Vector& deviceVector )
 }
 
 template< typename Vector >
-void setOscilatingConstantSequence( Vector& deviceVector,
-                                    typename Vector::RealType v )
+void setOscilatingConstantSequence( Vector& deviceVector )
 {
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
    for( int i = 0; i < a.getSize(); i++ )
@@ -61,7 +59,7 @@ void setOscilatingConstantSequence( Vector& deviceVector,
 template< typename Vector >
 void setNegativeLinearSequence( Vector& deviceVector )
 {
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
 #ifdef DISTRIBUTED_VECTOR
@@ -78,14 +76,13 @@ void setNegativeLinearSequence( Vector& deviceVector )
    deviceVector = a;
 }
 
-template< typename Vector >
-void setOscilatingSequence( Vector& deviceVector,
-                            typename Vector::RealType v )
+template< typename Vector, typename Value >
+void setOscilatingSequence( Vector& deviceVector, Value v )
 {
 #ifdef STATIC_VECTOR
    Vector a;
 #else
-   using HostVector = typename Vector::template Self< typename Vector::RealType, TNL::Devices::Host >;
+   using HostVector = typename Vector::template Self< typename Vector::ValueType, TNL::Devices::Host >;
    HostVector a;
    a.setLike( deviceVector );
 #endif
diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.cpp b/src/UnitTests/Containers/VectorPrefixSumTest.cpp
deleted file mode 100644
index 41ae5f65f082b27774cd6c31d249e7a763063634..0000000000000000000000000000000000000000
--- a/src/UnitTests/Containers/VectorPrefixSumTest.cpp
+++ /dev/null
@@ -1 +0,0 @@
-#include "VectorPrefixSumTest.h"
diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h
deleted file mode 100644
index 3c52e9eeff04af5d4bb26ea04edf2231b63f7546..0000000000000000000000000000000000000000
--- a/src/UnitTests/Containers/VectorPrefixSumTest.h
+++ /dev/null
@@ -1,347 +0,0 @@
-/***************************************************************************
-                          VectorTest-2.h  -  description
-                             -------------------
-    begin                : Oct 25, 2010
-    copyright            : (C) 2010 by Tomas Oberhuber
-    email                : tomas.oberhuber@fjfi.cvut.cz
- ***************************************************************************/
-
-/* See Copyright Notice in tnl/Copyright */
-
-#pragma once
-
-#ifdef HAVE_GTEST
-#include "VectorTestSetup.h"
-
-// should be small enough to have fast tests, but larger than minGPUReductionDataSize
-// and large enough to require multiple CUDA blocks for reduction
-constexpr int VECTOR_TEST_SIZE = 10000;
-
-TYPED_TEST( VectorTest, scan )
-{
-   using VectorType = typename TestFixture::VectorType;
-   using ViewType = typename TestFixture::ViewType;
-   using RealType = typename VectorType::RealType;
-   using DeviceType = typename VectorType::DeviceType;
-   using IndexType = typename VectorType::IndexType;
-   using HostVectorType = typename VectorType::template Self< RealType, Devices::Sequential >;
-   const int size = VECTOR_TEST_SIZE;
-
-   // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
-      return;
-
-   VectorType v( size );
-   ViewType v_view( v );
-   HostVectorType v_host( size );
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v.scan();
-   v_host = v_view;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.scan();
-   v_host = v_view;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v_view.scan();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v_view;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v_view.scan();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::gridsCount() ), 1  );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Inclusive, RealType, IndexType >::resetMaxGridSize();
-#endif
-   }
-}
-
-TYPED_TEST( VectorTest, exclusiveScan )
-{
-   using VectorType = typename TestFixture::VectorType;
-   using ViewType = typename TestFixture::ViewType;
-   using RealType = typename VectorType::RealType;
-   using DeviceType = typename VectorType::DeviceType;
-   using IndexType = typename VectorType::IndexType;
-   using HostVectorType = typename VectorType::template Self< RealType, Devices::Sequential >;
-   const int size = VECTOR_TEST_SIZE;
-
-   // FIXME: tests should work in all cases
-   if( std::is_same< RealType, float >::value )
-      return;
-
-   VectorType v;
-   v.setSize( size );
-   ViewType v_view( v );
-   HostVectorType v_host( size );
-
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   // test views
-   setConstantSequence( v, 0 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-   setConstantSequence( v, 1 );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-   setLinearSequence( v );
-   v_host = -1;
-   v_view.template scan< Algorithms::ScanType::Exclusive >();
-   v_host = v;
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-   ////
-   // With CUDA, perform tests with multiple CUDA grids.
-   if( std::is_same< DeviceType, Devices::Cuda >::value )
-   {
-#ifdef HAVE_CUDA
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::maxGridSize() = 3;
-
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      // test views
-      setConstantSequence( v, 0 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i;
-
-      setConstantSequence( v, 1 );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], i ) << "i = " << i;
-
-      setLinearSequence( v );
-      v_host = -1;
-      v_view.template scan< Algorithms::ScanType::Exclusive >();
-      EXPECT_GT( ( Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::gridsCount() ), 1 );
-      v_host = v;
-      for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i;
-
-      Algorithms::detail::CudaScanKernelLauncher< Algorithms::ScanType::Exclusive, RealType, IndexType >::resetMaxGridSize();
-#endif
-   }
-}
-
-// TODO: test scan with custom begin and end parameters
-
-
-template< typename FlagsView >
-void setupFlags( FlagsView& f )
-{
-   auto f1 = [] __cuda_callable__ ( typename FlagsView::IndexType i ) { return ( i % 5 ) == 0; };
-   f.evaluate( f1 );
-}
-
-/*
-TYPED_TEST( VectorTest, segmentedScan )
-{
-   using VectorType = typename TestFixture::VectorType;
-   using ViewType = typename TestFixture::ViewType;
-   using RealType = typename VectorType::RealType;
-   using DeviceType = typename VectorType::DeviceType;
-   using IndexType = typename VectorType::IndexType;
-   using FlagsArrayType = Array< bool, DeviceType, IndexType >;
-   using FlagsViewType = ArrayView< bool, DeviceType, IndexType >;
-   const int size = VECTOR_TEST_SIZE;
-
-   VectorType v( size );
-   ViewType v_view( v );
-
-   FlagsArrayType flags( size ), flags_copy( size );
-   FlagsViewType flags_view( flags );
-   //auto f1 = [] __cuda_callable__ ( IndexType i ) { return ( i % 5 ) == 0; };
-   //flags_view.evaluate( f1 );
-   setupFlags( flags_view );
-   flags_copy = flags_view;
-
-   v = 0;
-   v.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v.getElement( i ), 0 );
-   flags_view = flags_copy;
-
-   v = 1;
-   v.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v.getElement( i ), ( i % 5 ) + 1 );
-   flags_view = flags_copy;
-
-   setLinearSequence( v );
-   v.computeSegmentedScan( flags_view );
-   for( int i = 1; i < size; i++ )
-   {
-      if( flags.getElement( i ) )
-         EXPECT_EQ( v.getElement( i ), i );
-      else
-         EXPECT_EQ( v.getElement( i ) - v.getElement( i - 1 ), i );
-   }
-   flags_view = flags_copy;
-
-   v_view = 0;
-   v_view.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-      EXPECT_EQ( v_view.getElement( i ), 0 );
-   flags_view = flags_copy;
-
-   v_view = 1;
-   v_view.computeSegmentedScan( flags_view );
-   for( int i = 0; i < size; i++ )
-         EXPECT_EQ( v_view.getElement( i ), ( i % 5 ) + 1 );
-   flags_view = flags_copy;
-
-   //v_view.evaluate( [] __cuda_callable__ ( IndexType i ) { return i; } );
-   setLinearSequence( v );
-   v_view.computeSegmentedScan( flags_view );
-   for( int i = 1; i < size; i++ )
-   {
-      if( flags.getElement( i ) )
-         EXPECT_EQ( v_view.getElement( i ), i );
-      else
-         EXPECT_EQ( v_view.getElement( i ) - v_view.getElement( i - 1 ), i );
-   }
-}
-*/
-
-#endif // HAVE_GTEST
-
-#include "../main.h"
diff --git a/src/UnitTests/Containers/VectorPrefixSumTestCuda.cu b/src/UnitTests/Containers/VectorPrefixSumTestCuda.cu
deleted file mode 100644
index 41ae5f65f082b27774cd6c31d249e7a763063634..0000000000000000000000000000000000000000
--- a/src/UnitTests/Containers/VectorPrefixSumTestCuda.cu
+++ /dev/null
@@ -1 +0,0 @@
-#include "VectorPrefixSumTest.h"
diff --git a/src/UnitTests/Containers/VectorTest.h b/src/UnitTests/Containers/VectorTest.h
index 8dd7d270ace21e2d7c9c5ad91c40f8cfacce90fa..ea1676b67f2e38471dae5f45c7088c0c8928856d 100644
--- a/src/UnitTests/Containers/VectorTest.h
+++ b/src/UnitTests/Containers/VectorTest.h
@@ -80,31 +80,6 @@ TYPED_TEST( VectorTest, constructors )
 
 }
 
-// test must be in a plain function because nvcc sucks (extended lambdas are
-// not allowed to be defined in protected class member functions)
-template< typename VectorType >
-void testVectorReduceElements()
-{
-   using IndexType = typename VectorType::IndexType;
-   using ValueType = typename VectorType::ValueType;
-
-   VectorType a( 10 );
-   a.forAllElements( [=] __cuda_callable__ ( IndexType i, ValueType& v ) mutable { v = 1; } );
-   auto fetch = [] __cuda_callable__ ( IndexType i, ValueType& v ) -> ValueType { return v; };
-   auto reduce = [] __cuda_callable__ ( const ValueType v1, const ValueType v2 ) { return v1 + v2; };
-   EXPECT_EQ( a.reduceEachElement( fetch, reduce, ( ValueType ) 0.0 ),
-              a.getSize() );
-
-   const VectorType b( a );
-   auto const_fetch = [] __cuda_callable__ ( IndexType i, const ValueType& v ) -> ValueType { return v; };
-   EXPECT_EQ( b.reduceEachElement( const_fetch, reduce, ( ValueType ) 0.0 ),
-              b.getSize() );
-}
-TYPED_TEST( VectorTest, reduceElements )
-{
-   testVectorReduceElements< typename TestFixture::VectorType >();
-}
-
 TEST( VectorSpecialCasesTest, defaultConstructors )
 {
    #ifdef HAVE_CUDA
@@ -261,7 +236,7 @@ TEST( VectorSpecialCasesTest, reductionOfEmptyVector )
    EXPECT_EQ( product(v), 1 );
    EXPECT_EQ( logicalAnd(v), true );
    EXPECT_EQ( logicalOr(v), false );
-   EXPECT_EQ( binaryAnd(v), std::numeric_limits< int >::max() );
+   EXPECT_EQ( binaryAnd(v), ~0 );
    EXPECT_EQ( binaryOr(v), 0 );
 
    EXPECT_EQ( min(v_view), std::numeric_limits< int >::max() );
@@ -272,7 +247,7 @@ TEST( VectorSpecialCasesTest, reductionOfEmptyVector )
    EXPECT_EQ( product(v_view), 1 );
    EXPECT_EQ( logicalAnd(v_view), true );
    EXPECT_EQ( logicalOr(v_view), false );
-   EXPECT_EQ( binaryAnd(v_view), std::numeric_limits< int >::max() );
+   EXPECT_EQ( binaryAnd(v_view), ~0 );
    EXPECT_EQ( binaryOr(v_view), 0 );
 }
 
diff --git a/src/UnitTests/Containers/VectorTestSetup.h b/src/UnitTests/Containers/VectorTestSetup.h
index c9863009b372e8c52151b89e42e3edd7d268dd86..7141466b80fee19588945d30be9a15ceba6bcfa1 100644
--- a/src/UnitTests/Containers/VectorTestSetup.h
+++ b/src/UnitTests/Containers/VectorTestSetup.h
@@ -37,6 +37,25 @@ protected:
 // TODO: Quad must be fixed
 using VectorTypes = ::testing::Types<
 #ifndef HAVE_CUDA
+   Vector< int,            Devices::Sequential, short >,
+   Vector< long,           Devices::Sequential, short >,
+   Vector< float,          Devices::Sequential, short >,
+   Vector< double,         Devices::Sequential, short >,
+   //Vector< Quad< float >,  Devices::Sequential, short >,
+   //Vector< Quad< double >, Devices::Sequential, short >,
+   Vector< int,            Devices::Sequential, int >,
+   Vector< long,           Devices::Sequential, int >,
+   Vector< float,          Devices::Sequential, int >,
+   Vector< double,         Devices::Sequential, int >,
+   //Vector< Quad< float >,  Devices::Sequential, int >,
+   //Vector< Quad< double >, Devices::Sequential, int >,
+   Vector< int,            Devices::Sequential, long >,
+   Vector< long,           Devices::Sequential, long >,
+   Vector< float,          Devices::Sequential, long >,
+   Vector< double,         Devices::Sequential, long >,
+   //Vector< Quad< float >,  Devices::Sequential, long >,
+   //Vector< Quad< double >, Devices::Sequential, long >,
+
    Vector< int,            Devices::Host, short >,
    Vector< long,           Devices::Host, short >,
    Vector< float,          Devices::Host, short >,
diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
index 43e2e2687b0c793f386aba4886d59fcfe8139cb5..eb3c656339ba9ba672a8ae3318f00100aa62c8c3 100644
--- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h
+++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h
@@ -28,6 +28,7 @@
 #endif
 
 #include "VectorHelperFunctions.h"
+#include "../CustomScalar.h"
 
 #include "gtest/gtest.h"
 
@@ -76,11 +77,13 @@ protected:
    #ifndef HAVE_CUDA
       DistributedVector<           double, Devices::Host, int >,
       DistributedVectorView<       double, Devices::Host, int >,
-      DistributedVectorView< const double, Devices::Host, int >
+      DistributedVectorView< const double, Devices::Host, int >,
+      DistributedVector< CustomScalar< double >, Devices::Host, int >
    #else
       DistributedVector<           double, Devices::Cuda, int >,
       DistributedVectorView<       double, Devices::Cuda, int >,
-      DistributedVectorView< const double, Devices::Cuda, int >
+      DistributedVectorView< const double, Devices::Cuda, int >,
+      DistributedVector< CustomScalar< double >, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -90,7 +93,7 @@ protected:
          StaticVector< 2, StaticVector< 3, double > >,
          StaticVector< 3, StaticVector< 3, double > >,
          StaticVector< 4, StaticVector< 3, double > >,
-         StaticVector< 5, StaticVector< 3, double > >
+         StaticVector< 5, StaticVector< 3, CustomScalar< double > > >
       >;
    #else
       using VectorTypes = ::testing::Types<
@@ -102,8 +105,8 @@ protected:
          StaticVector< 3, double >,
          StaticVector< 4, int >,
          StaticVector< 4, double >,
-         StaticVector< 5, int >,
-         StaticVector< 5, double >
+         StaticVector< 5, CustomScalar< int > >,
+         StaticVector< 5, CustomScalar< double > >
       >;
    #endif
 #else
@@ -111,10 +114,12 @@ protected:
       using VectorTypes = ::testing::Types<
       #ifndef HAVE_CUDA
          Vector<     StaticVector< 3, double >, Devices::Host >,
-         VectorView< StaticVector< 3, double >, Devices::Host >
+         VectorView< StaticVector< 3, double >, Devices::Host >,
+         VectorView< StaticVector< 3, CustomScalar< double > >, Devices::Host >
       #else
          Vector<     StaticVector< 3, double >, Devices::Cuda >,
-         VectorView< StaticVector< 3, double >, Devices::Cuda >
+         VectorView< StaticVector< 3, double >, Devices::Cuda >,
+         VectorView< StaticVector< 3, CustomScalar< double > >, Devices::Cuda >
       #endif
       >;
    #else
@@ -124,14 +129,18 @@ protected:
          VectorView< int,       Devices::Host >,
          VectorView< const int, Devices::Host >,
          Vector<     double,    Devices::Host >,
-         VectorView< double,    Devices::Host >
+         VectorView< double,    Devices::Host >,
+         Vector<     CustomScalar< int >, Devices::Host >,
+         VectorView< CustomScalar< int >, Devices::Host >
       #endif
       #ifdef HAVE_CUDA
          Vector<     int,       Devices::Cuda >,
          VectorView< int,       Devices::Cuda >,
          VectorView< const int, Devices::Cuda >,
          Vector<     double,    Devices::Cuda >,
-         VectorView< double,    Devices::Cuda >
+         VectorView< double,    Devices::Cuda >,
+         Vector<     CustomScalar< int >, Devices::Cuda >,
+         VectorView< CustomScalar< int >, Devices::Cuda >
       #endif
       >;
    #endif
@@ -164,7 +173,7 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       const double h = (double) (end - begin) / _size;         \
       for( int i = 0; i < _size; i++ )                         \
       {                                                        \
-         const RealType x = begin + i * h;                     \
+         const RealType x = begin + RealType( i * h );         \
          V1[ i ] = x;                                          \
          expected[ i ] = function(x);                          \
       }                                                        \
@@ -209,7 +218,7 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       const double h = (double) (end - begin) / size;          \
       for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) \
       {                                                        \
-         const RealType x = begin + i * h;                     \
+         const RealType x = begin + RealType( i * h );         \
          _V1h[ i ] = x;                                        \
          expected_h[ i ] = function(x);                        \
       }                                                        \
@@ -229,11 +238,12 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
    #define SETUP_UNARY_VECTOR_TEST( size ) \
       using VectorType = typename TestFixture::VectorType;     \
       using VectorOrView = typename TestFixture::VectorOrView; \
+      using ValueType = typename VectorType::ValueType;        \
                                                                \
       VectorType _V1( size ), _V2( size );                     \
                                                                \
-      _V1 = 1;                                                 \
-      _V2 = 2;                                                 \
+      _V1 = ValueType( 1 );                                    \
+      _V2 = ValueType( 2 );                                    \
                                                                \
       VectorOrView V1( _V1 ), V2( _V2 );                       \
 
@@ -251,7 +261,7 @@ TYPED_TEST_SUITE( VectorUnaryOperationsTest, VectorTypes );
       const double h = (double) (end - begin) / size;          \
       for( int i = 0; i < size; i++ )                          \
       {                                                        \
-         const RealType x = begin + i * h;                     \
+         const RealType x = begin + RealType( i * h );         \
          _V1h[ i ] = x;                                        \
          expected_h[ i ] = function(x);                        \
       }                                                        \
diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
index f73b502ccc9ee12bb812acf963b680991c11a877..b201f563df191e70ac4fc8ad731b2a69984156ad 100644
--- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h
+++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h
@@ -28,6 +28,7 @@
 #endif
 
 #include "VectorHelperFunctions.h"
+#include "../CustomScalar.h"
 
 #include "gtest/gtest.h"
 
@@ -112,11 +113,13 @@ protected:
    #ifndef HAVE_CUDA
       DistributedVector<           double, Devices::Host, int >,
       DistributedVectorView<       double, Devices::Host, int >,
-      DistributedVectorView< const double, Devices::Host, int >
+      DistributedVectorView< const double, Devices::Host, int >,
+      DistributedVector< CustomScalar< double >, Devices::Host, int >
    #else
       DistributedVector<           double, Devices::Cuda, int >,
       DistributedVectorView<       double, Devices::Cuda, int >,
-      DistributedVectorView< const double, Devices::Cuda, int >
+      DistributedVectorView< const double, Devices::Cuda, int >,
+      DistributedVector< CustomScalar< double >, Devices::Cuda, int >
    #endif
    >;
 #elif defined(STATIC_VECTOR)
@@ -126,7 +129,7 @@ protected:
          StaticVector< 2, StaticVector< 3, double > >,
          StaticVector< 3, StaticVector< 3, double > >,
          StaticVector< 4, StaticVector< 3, double > >,
-         StaticVector< 5, StaticVector< 3, double > >
+         StaticVector< 5, StaticVector< 3, CustomScalar< double > > >
       >;
    #else
       using VectorTypes = ::testing::Types<
@@ -138,8 +141,8 @@ protected:
          StaticVector< 3, double >,
          StaticVector< 4, int >,
          StaticVector< 4, double >,
-         StaticVector< 5, int >,
-         StaticVector< 5, double >
+         StaticVector< 5, CustomScalar< int > >,
+         StaticVector< 5, CustomScalar< double > >
       >;
    #endif
 #else
@@ -160,14 +163,18 @@ protected:
          VectorView< int,       Devices::Host >,
          VectorView< const int, Devices::Host >,
          Vector<     double,    Devices::Host >,
-         VectorView< double,    Devices::Host >
+         VectorView< double,    Devices::Host >,
+         Vector<     CustomScalar< int >, Devices::Host >,
+         VectorView< CustomScalar< int >, Devices::Host >
       #endif
       #ifdef HAVE_CUDA
          Vector<     int,       Devices::Cuda >,
          VectorView< int,       Devices::Cuda >,
          VectorView< const int, Devices::Cuda >,
          Vector<     double,    Devices::Cuda >,
-         VectorView< double,    Devices::Cuda >
+         VectorView< double,    Devices::Cuda >,
+         Vector<     CustomScalar< int >, Devices::Cuda >,
+         VectorView< CustomScalar< int >, Devices::Cuda >
       #endif
       >;
    #endif
diff --git a/src/UnitTests/CustomScalar.h b/src/UnitTests/CustomScalar.h
new file mode 100644
index 0000000000000000000000000000000000000000..5e957750ab4a49cb0bc19389bb653039f346b275
--- /dev/null
+++ b/src/UnitTests/CustomScalar.h
@@ -0,0 +1,267 @@
+#pragma once
+
+#include <iostream>
+#include <limits>
+
+#include <TNL/Math.h>
+#include <TNL/MPI/getDataType.h>
+
+namespace TNL {
+
+template< class T >
+struct CustomScalar
+{
+   T value = 0;
+
+public:
+   constexpr CustomScalar() = default;
+
+   constexpr CustomScalar( T value ) : value( value ) {}
+
+   template< typename S >
+   constexpr CustomScalar( const CustomScalar< S >& v ) : value( v.value ) {}
+
+   constexpr CustomScalar( const CustomScalar& ) = default;
+
+   constexpr CustomScalar( CustomScalar&& ) = default;
+
+   constexpr CustomScalar& operator=( const CustomScalar& v ) = default;
+
+   constexpr CustomScalar& operator=( CustomScalar&& v ) = default;
+
+#define MAKE_ASSIGNMENT_OP(op) \
+   template< typename S >                                            \
+   constexpr CustomScalar& operator op( const CustomScalar< S >& v ) \
+   {                                                                 \
+      value op v.value;                                              \
+      return *this;                                                  \
+   }                                                                 \
+   template< typename S >                                            \
+   constexpr CustomScalar& operator op( const S& v )                 \
+   {                                                                 \
+      value op v;                                                    \
+      return *this;                                                  \
+   }                                                                 \
+
+   MAKE_ASSIGNMENT_OP(+=)
+   MAKE_ASSIGNMENT_OP(-=)
+   MAKE_ASSIGNMENT_OP(*=)
+   MAKE_ASSIGNMENT_OP(/=)
+   MAKE_ASSIGNMENT_OP(%=)
+   MAKE_ASSIGNMENT_OP(&=)
+   MAKE_ASSIGNMENT_OP(|=)
+   MAKE_ASSIGNMENT_OP(^=)
+   MAKE_ASSIGNMENT_OP(<<=)
+   MAKE_ASSIGNMENT_OP(>>=)
+
+#undef MAKE_ASSIGNMENT_OP
+
+   // bitwise negation
+   constexpr bool operator~() const
+   {
+      return ~ value;
+   }
+
+   // logical negation
+   constexpr bool operator!() const
+   {
+      return ! value;
+   }
+
+   // unary plus (integer promotion)
+   constexpr auto operator+() const -> CustomScalar< decltype(+value) >
+   {
+      return +value;
+   }
+
+   // unary minus (additive inverse)
+   constexpr CustomScalar operator-() const
+   {
+      return -value;
+   }
+
+   // prefix increment
+   constexpr CustomScalar& operator++()
+   {
+      ++value;
+      return *this;
+   }
+
+   // prefix decrement
+   constexpr CustomScalar& operator--()
+   {
+      --value;
+      return *this;
+   }
+
+   // postfix increment
+   constexpr CustomScalar operator++(int)
+   {
+      CustomScalar result = *this;
+      value++;
+      return result;
+   }
+
+   // postfix decrement
+   constexpr CustomScalar operator--(int)
+   {
+      CustomScalar result = *this;
+      value--;
+      return result;
+   }
+
+   // cast to T
+   constexpr operator T() const
+   {
+      return value;
+   }
+};
+
+#define MAKE_BINARY_OP(op)                                              \
+template< class T, class S >                                            \
+constexpr auto operator op( const CustomScalar< T >& v1,                \
+                            const CustomScalar< S >& v2 )               \
+   -> CustomScalar< decltype( v1.value op v2.value ) >                  \
+{                                                                       \
+   return v1.value op v2.value;                                         \
+}                                                                       \
+template< class T, class S >                                            \
+constexpr auto operator op( const CustomScalar< T >& v1, const S& v2 )  \
+   -> CustomScalar< decltype( v1.value op v2 ) >                        \
+{                                                                       \
+   return v1.value op v2;                                               \
+}                                                                       \
+template< class S, class T >                                            \
+constexpr auto operator op( const S& v1, const CustomScalar< T >& v2 )  \
+   -> CustomScalar< decltype( v1 op v2.value ) >                        \
+{                                                                       \
+   return v1 op v2.value;                                               \
+}                                                                       \
+
+MAKE_BINARY_OP(+)
+MAKE_BINARY_OP(-)
+MAKE_BINARY_OP(*)
+MAKE_BINARY_OP(/)
+MAKE_BINARY_OP(%)
+MAKE_BINARY_OP(&)
+MAKE_BINARY_OP(|)
+MAKE_BINARY_OP(^)
+MAKE_BINARY_OP(<<)
+MAKE_BINARY_OP(>>)
+
+#undef MAKE_BINARY_OP
+
+#define MAKE_BOOL_BINARY_OP(op)                                         \
+template< class T, class S >                                            \
+constexpr bool operator op( const CustomScalar< T >& v1,                \
+                            const CustomScalar< S >& v2 )               \
+{                                                                       \
+   return v1.value op v2.value;                                         \
+}                                                                       \
+template< class T, class S >                                            \
+constexpr bool operator op( const CustomScalar< T >& v1, const S& v2 )  \
+{                                                                       \
+   return v1.value op v2;                                               \
+}                                                                       \
+template< class S, class T >                                            \
+constexpr bool operator op( const S& v1, const CustomScalar< T >& v2 )  \
+{                                                                       \
+   return v1 op v2.value;                                               \
+}                                                                       \
+
+MAKE_BOOL_BINARY_OP(==)
+MAKE_BOOL_BINARY_OP(!=)
+MAKE_BOOL_BINARY_OP(<=)
+MAKE_BOOL_BINARY_OP(>=)
+MAKE_BOOL_BINARY_OP(<)
+MAKE_BOOL_BINARY_OP(>)
+MAKE_BOOL_BINARY_OP(&&)
+MAKE_BOOL_BINARY_OP(||)
+
+#undef MAKE_BOOL_BINARY_OP
+
+template< class T >
+std::istream& operator>>( std::istream& str, const CustomScalar< T >& v )
+{
+   return str >> v.value;
+}
+
+template< class T >
+std::ostream& operator<<( std::ostream& str, const CustomScalar< T >& v )
+{
+   return str << v.value;
+}
+
+#define MAKE_UNARY_FUNC(fname)                              \
+   template< class T >                                      \
+   constexpr auto fname ( const CustomScalar< T >& v )      \
+      -> CustomScalar< decltype(TNL::fname( v.value )) >    \
+   { return TNL::fname( v.value ); }                        \
+
+#define MAKE_BINARY_FUNC(fname)                                                     \
+   template< class T, class S >                                                     \
+   constexpr auto fname ( const CustomScalar< T >& v, const CustomScalar< S >& w )  \
+      -> CustomScalar< decltype(TNL::fname( v.value, w.value )) >                   \
+   { return TNL::fname( v.value, w.value ); }                                       \
+   template< class T, class S >                                                     \
+   constexpr auto fname ( const CustomScalar< T >& v, const S& w )                  \
+      -> CustomScalar< decltype(TNL::fname( v.value, w )) >                         \
+   { return TNL::fname( v.value, w ); }                                             \
+   template< class S, class T >                                                     \
+   constexpr auto fname ( const S& w, const CustomScalar< T >& v )                  \
+      -> CustomScalar< decltype(TNL::fname( w, v.value )) >                         \
+   { return TNL::fname( w, v.value ); }                                             \
+
+MAKE_UNARY_FUNC( abs )
+MAKE_UNARY_FUNC( sqrt )
+MAKE_UNARY_FUNC( cbrt )
+MAKE_UNARY_FUNC( exp )
+MAKE_UNARY_FUNC( log )
+MAKE_UNARY_FUNC( log10 )
+MAKE_UNARY_FUNC( log2 )
+MAKE_UNARY_FUNC( sin )
+MAKE_UNARY_FUNC( cos )
+MAKE_UNARY_FUNC( tan )
+MAKE_UNARY_FUNC( asin )
+MAKE_UNARY_FUNC( acos )
+MAKE_UNARY_FUNC( atan )
+MAKE_UNARY_FUNC( sinh )
+MAKE_UNARY_FUNC( cosh )
+MAKE_UNARY_FUNC( tanh )
+MAKE_UNARY_FUNC( asinh )
+MAKE_UNARY_FUNC( acosh )
+MAKE_UNARY_FUNC( atanh )
+MAKE_UNARY_FUNC( floor )
+MAKE_UNARY_FUNC( ceil )
+
+MAKE_BINARY_FUNC( min )
+MAKE_BINARY_FUNC( max )
+MAKE_BINARY_FUNC( argMin )
+MAKE_BINARY_FUNC( argMax )
+MAKE_BINARY_FUNC( argAbsMin )
+MAKE_BINARY_FUNC( argAbsMax )
+MAKE_BINARY_FUNC( pow )
+
+#undef MAKE_UNARY_FUNC
+#undef MAKE_BINARY_FUNC
+
+} // namespace TNL
+
+namespace std {
+   template< typename T >
+   struct numeric_limits< TNL::CustomScalar< T > > : public numeric_limits< T > {};
+} // namespace std
+
+namespace TNL {
+   template< typename T >
+   struct IsScalarType< CustomScalar< T > > : public std::true_type {};
+} // namespace TNL
+
+#ifdef HAVE_MPI
+namespace TNL {
+namespace MPI {
+   template< typename T >
+   struct TypeResolver< CustomScalar< T > > : public TypeResolver< T > {};
+} // namespace MPI
+} // namespace TNL
+#endif
diff --git a/src/UnitTests/CustomScalarTest.cpp b/src/UnitTests/CustomScalarTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c650e050f1c11d76fb9c06839deb79cf7305c706
--- /dev/null
+++ b/src/UnitTests/CustomScalarTest.cpp
@@ -0,0 +1,43 @@
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+
+#include "CustomScalar.h"
+
+using scalar = TNL::CustomScalar< int >;
+
+TEST( CustomScalarTest, comparison )
+{
+   scalar a = 1;
+   EXPECT_EQ( a, 1 );
+   EXPECT_EQ( 1, a );
+   EXPECT_NE( a, 2 );
+   EXPECT_NE( 2, a );
+   EXPECT_LE( a, 1 );
+   EXPECT_LE( 1, a );
+   EXPECT_GE( a, 1 );
+   EXPECT_GE( 1, a );
+   EXPECT_LT( a, 2 );
+   EXPECT_LT( 0, a );
+   EXPECT_GT( a, 0 );
+   EXPECT_GT( 2, a );
+
+   scalar b = 1.0;
+   EXPECT_EQ( b, 1.0 );
+   EXPECT_EQ( 1.0, b );
+   EXPECT_NE( b, 2.0 );
+   EXPECT_NE( 2.0, b );
+   EXPECT_LE( b, 1.0 );
+   EXPECT_LE( 1.0, b );
+   EXPECT_GE( b, 1.0 );
+   EXPECT_GE( 1.0, b );
+   EXPECT_LT( b, 2.0 );
+   EXPECT_LT( 0.0, b );
+   EXPECT_GT( b, 0.0 );
+   EXPECT_GT( 2.0, b );
+}
+
+// TODO: test the other operators
+
+#endif
+
+#include "main.h"
diff --git a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
index 0f2b00595b2e7c98f696a460adce304497545f67..9bfd551be13cdc7484461a30a4e3061c8b39e3fd 100644
--- a/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/BinarySparseMatrixCopyTest.h
@@ -20,6 +20,7 @@
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
@@ -492,7 +493,7 @@ void multidiagonalMatrixAssignment()
    MultidiagonalHost hostMatrix( rows, columns, diagonals );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             hostMatrix.setElement( i, j, TNL::min( i + j, 1 ) );
 
    Matrix matrix;
@@ -509,7 +510,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), TNL::min( i + j, 1 ) );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -524,7 +525,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), TNL::min( i + j, 1 ) );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
diff --git a/src/UnitTests/Matrices/DenseMatrixCopyTest.h b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
index fb1277ea2c38752f79fcdc6aa4eccaa7db0d0a18..b0cc4d9ac7d9f440f30420815f41d7d4d637cdcd 100644
--- a/src/UnitTests/Matrices/DenseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixCopyTest.h
@@ -20,6 +20,7 @@
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
@@ -458,7 +459,7 @@ void multidiagonalMatrixAssignment()
    MultidiagonalHost hostMatrix( rows, columns, diagonals );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             hostMatrix.setElement( i, j, i + j );
 
    Matrix matrix;
@@ -471,7 +472,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -486,7 +487,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index ef7d077a536d543f29a17f5c8086469139a572be..ceb7ae358855a35d3dba2848375a76f8994fe88b 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -17,7 +17,7 @@
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Algorithms/ParallelFor.h>
-#include <TNL/Algorithms/Reduction.h>
+#include <TNL/Algorithms/reduce.h>
 #include <TNL/Math.h>
 
 using Dense_host_float = TNL::Matrices::DenseMatrix< float, TNL::Devices::Host, int >;
diff --git a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h
index 0f2a4a63296711fbc102fecea420b3728b1a82a1..b4437a55521e48e9073cb0a8000b31ab380570c0 100644
--- a/src/UnitTests/Matrices/MultidiagonalMatrixTest.h
+++ b/src/UnitTests/Matrices/MultidiagonalMatrixTest.h
@@ -8,16 +8,16 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
+#include <iostream>
 #include <sstream>
 #include <TNL/Devices/Host.h>
 #include <TNL/Matrices/Matrix.h>
 #include <TNL/Matrices/MultidiagonalMatrix.h>
+#include <TNL/Algorithms/contains.h>
 #include <TNL/Containers/Array.h>
-
 #include <TNL/Containers/Vector.h>
 #include <TNL/Containers/VectorView.h>
 #include <TNL/Math.h>
-#include <iostream>
 
 using Multidiagonal_host_float = TNL::Matrices::MultidiagonalMatrix< float, TNL::Devices::Host, int >;
 using Multidiagonal_host_int = TNL::Matrices::MultidiagonalMatrix< int, TNL::Devices::Host, int >;
@@ -174,7 +174,7 @@ void test_SetElements()
          {
             for( int k = 0; k < matrixSize; k++ )
             {
-               if( k == elementIdx - gridSize || 
+               if( k == elementIdx - gridSize ||
                    k == elementIdx - 1 ||
                    k == elementIdx + 1 ||
                    k == elementIdx + gridSize )
@@ -403,7 +403,7 @@ void test_SetElement()
    RealType value = 1;
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             m.setElement( i, j, value++ );
          else
          {
@@ -466,7 +466,7 @@ void test_AddElement()
    RealType value = 1;
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
          {
             if( j >= i )
                m.setElement( i, j, value );
@@ -524,7 +524,7 @@ void test_AddElement()
    RealType multiplicator = 2;
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             m.addElement( i, j, value++, multiplicator );
          else
          {
@@ -669,7 +669,7 @@ void test_AddRow()
       for( IndexType j = 0; j < cols; j++ )
       {
          IndexType offset = j - i;
-         if( diagonals.containsValue( offset ) && offset >= 0)
+         if( TNL::Algorithms::contains( diagonals, offset ) && offset >= 0)
             m.setElement( i, j, value );
          value++;
       }
@@ -883,7 +883,7 @@ void test_VectorProduct()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++)
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             m.setElement( i, j, value );
          value++;
       }
@@ -1285,7 +1285,7 @@ void test_AssignmentOperator()
    MultidiagonalHost hostMatrix( rows, columns, diagonalsOffsets );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j <  columns; j++ )
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             hostMatrix.setElement( i, j,  i + j );
 
    Matrix matrix( rows, columns, diagonalsOffsets );
@@ -1293,7 +1293,7 @@ void test_AssignmentOperator()
    matrix = hostMatrix;
    for( IndexType i = 0; i < columns; i++ )
       for( IndexType j = 0; j < rows; j++ )
-            if( diagonalsOffsets.containsValue( j - i ) )
+            if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
                EXPECT_EQ( matrix.getElement( i, j ), i + j );
             else
                EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -1302,7 +1302,7 @@ void test_AssignmentOperator()
    MultidiagonalCuda cudaMatrix( rows, columns, diagonalsOffsets );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             cudaMatrix.setElement( i, j, i + j );
 
    matrix.getValues() = 0.0;
@@ -1310,7 +1310,7 @@ void test_AssignmentOperator()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -1345,7 +1345,7 @@ void test_SaveAndLoad()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < cols; j++ )
       {
-         if( diagonalsOffsets.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonalsOffsets, j - i ) )
             savedMatrix.setElement( i, j, value );
          value++;
       }
diff --git a/src/UnitTests/Matrices/SparseMatrixCopyTest.h b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
index 098a3e0a41a43fb51223dff10452b7499fe9c58c..81a7f26c945908b7e69100f24adaf1e45ead6a8d 100644
--- a/src/UnitTests/Matrices/SparseMatrixCopyTest.h
+++ b/src/UnitTests/Matrices/SparseMatrixCopyTest.h
@@ -20,6 +20,7 @@
 #include <TNL/Algorithms/Segments/CSR.h>
 #include <TNL/Algorithms/Segments/Ellpack.h>
 #include <TNL/Algorithms/Segments/SlicedEllpack.h>
+#include <TNL/Algorithms/contains.h>
 
 template< typename Device, typename Index, typename IndexAllocator >
 using EllpackSegments = TNL::Algorithms::Segments::Ellpack< Device, Index, IndexAllocator >;
@@ -495,7 +496,7 @@ void multidiagonalMatrixAssignment()
    MultidiagonalHost hostMatrix( rows, columns, diagonals );
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             hostMatrix.setElement( i, j, i + j );
 
    Matrix matrix;
@@ -512,7 +513,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
@@ -527,7 +528,7 @@ void multidiagonalMatrixAssignment()
    for( IndexType i = 0; i < rows; i++ )
       for( IndexType j = 0; j < columns; j++ )
       {
-         if( diagonals.containsValue( j - i ) )
+         if( TNL::Algorithms::contains( diagonals, j - i ) )
             EXPECT_EQ( matrix.getElement( i, j ), i + j );
          else
             EXPECT_EQ( matrix.getElement( i, j ), 0.0 );
diff --git a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
index a0eddd162f31bc8da213a5fced8778832f75a38a..f09f222874ca4374dbdfd46b25b241859f1a29af 100644
--- a/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
+++ b/src/UnitTests/Meshes/DistributedMeshes/DistributedMeshTest.h
@@ -403,8 +403,8 @@ void validateMesh( const Mesh& mesh, const Distributor& distributor, int ghostLe
       }
       vert_offsets.setElement( distributor.nproc, 0 );
       cell_offsets.setElement( distributor.nproc, 0 );
-      vert_offsets.template scan< Algorithms::ScanType::Exclusive >();
-      cell_offsets.template scan< Algorithms::ScanType::Exclusive >();
+      Algorithms::inplaceExclusiveScan( vert_offsets );
+      Algorithms::inplaceExclusiveScan( cell_offsets );
       EXPECT_EQ( vert_offsets[ distributor.rank ], mesh.template getGlobalIndices< 0 >()[ 0 ] );
       EXPECT_EQ( cell_offsets[ distributor.rank ], mesh.template getGlobalIndices< 2 >()[ 0 ] );