diff --git a/Documentation/Examples/Containers/CMakeLists.txt b/Documentation/Examples/Containers/CMakeLists.txt
index bd7d9b714744d82bcba48e27c2c38538c70e84f1..e85546a4516a1d99f9bde21a7c9fa98a554c00d9 100644
--- a/Documentation/Examples/Containers/CMakeLists.txt
+++ b/Documentation/Examples/Containers/CMakeLists.txt
@@ -8,18 +8,48 @@ set( COMMON_EXAMPLES
          VectorExample
 )
 
+# Examples that are launched through mpirun; they are built only when BUILD_MPI is enabled.
+set( MPI_COMMON_EXAMPLES
+         DistributedArrayExample
+)
+
+# Launcher options only (4 processes on localhost). The binary to run is supplied
+# by each add_custom_command below through $<TARGET_FILE:...>.
+set( mpi_test_parameters -np 4 -H localhost:4 )
+
 if( BUILD_CUDA )
    foreach( target IN ITEMS ${COMMON_EXAMPLES} )
       cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
       add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
       set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
    endforeach()
+   if( BUILD_MPI )
+      foreach( target IN ITEMS ${MPI_COMMON_EXAMPLES} )
+         cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+         # The example is an argument of mpirun, not the command itself, so CMake does not
+         # expand the target name: pass its path explicitly and declare the dependency.
+         add_custom_command( COMMAND "mpirun" ${mpi_test_parameters} $<TARGET_FILE:${target}-cuda> > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out
+                             OUTPUT ${target}.out
+                             DEPENDS ${target}-cuda )
+         set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+      endforeach()
+   endif()
 else()
    foreach( target IN ITEMS ${HOST_EXAMPLES} )
       add_executable( ${target} ${target}.cpp )
       add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
       set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
    endforeach()
+   if( BUILD_MPI )
+      foreach( target IN ITEMS ${MPI_COMMON_EXAMPLES} )
+         add_executable( ${target} ${target}.cpp )
+         # Host build: the target has no -cuda suffix and its output belongs to HOST_OUTPUTS.
+         add_custom_command( COMMAND "mpirun" ${mpi_test_parameters} $<TARGET_FILE:${target}> > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out
+                             OUTPUT ${target}.out
+                             DEPENDS ${target} )
+         set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+      endforeach()
+   endif()
 endif()
 
 IF( BUILD_CUDA )
diff --git a/Documentation/Examples/Containers/DistributedArrayExample.cpp b/Documentation/Examples/Containers/DistributedArrayExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8e191f652570771577589cd4920a1d0e3f9100f5
--- /dev/null
+++ b/Documentation/Examples/Containers/DistributedArrayExample.cpp
@@ -0,0 +1,48 @@
+#include <iostream>
+#include <TNL/Containers/Partitioner.h>
+#include <TNL/Containers/DistributedArray.h>
+#include <TNL/MPI/ScopedInitializer.h>
+
+using namespace TNL;
+using namespace std;
+
+/***
+ * Fills a distributed array via forElements; works for any device (CPU, GPU ...).
+ */
+template< typename Device >
+void distributedArrayExample()
+{
+   using ArrayType = Containers::DistributedArray< int, Device >;
+   using IndexType = typename ArrayType::IndexType;
+   using LocalRangeType = typename ArrayType::LocalRangeType;
+
+   const MPI_Comm group = TNL::MPI::AllGroup();
+   const int nproc = TNL::MPI::GetSize(group);
+
+   /***
+    * We set size to a prime number to force non-uniform distribution of the
+    * distributed array; ghosts are requested only when nproc > 1.
+    */
+   const int size = 97;
+   const int ghosts = (nproc > 1) ? 4 : 0;
+
+   /***
+    * The partitioner yields the local range for this process; every element
+    * visited by forElements is then set to the index it is called with.
+    */
+   const LocalRangeType localRange = Containers::Partitioner< IndexType >::splitRange( size, group );
+   ArrayType a( localRange, ghosts, size, group );
+   a.forElements( 0, size, [=] __cuda_callable__ ( const int idx, int& value ) { value = idx; } );
+}
+
+int main( int argc, char* argv[] )
+{
+   // Runs the example on the host and, if HAVE_CUDA is defined, on the GPU; `mpi` presumably keeps MPI initialized for the scope of main (confirm).
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
+   std::cout << "The first test runs on CPU ..." << std::endl;
+   distributedArrayExample< Devices::Host >();
+#ifdef HAVE_CUDA
+   std::cout << "The second test runs on GPU ..." << std::endl;
+   distributedArrayExample< Devices::Cuda >();
+#endif
+}
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp
index 2e05b167870b4c4cd33dbd3f26fe6d0bdab2c83f..a4c78f97b57a6d207643d06890421776e31de506 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp
@@ -36,7 +36,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp
index 5fc1195ecb62974794778f4dc543c901c31e0a4a..17747e4285ee9648ce4deb7fa6c46ba62bf22c2d 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp
@@ -29,7 +29,7 @@ void getRowExample()
       return row.getValue( rowIdx );
    };
 
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp
index b8ebf918175309adb754d2db35ade6d81e85bb2d..f5f662b89218d4001f614fbf25850fceb351eb7f 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp
@@ -41,7 +41,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << *matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp
index 346e331dba3284e798255d0b12e4d50aae8e2212..72d04c8ade019b8a3a72e82430d3a07443787ece 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp
@@ -32,7 +32,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
index 4d3ae4ff518201ae4eee03a7ccac69ff6a16c423..b13ead12c5b479732f085aa734598ddfead8e6cb 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
@@ -36,7 +36,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
index 2b5f0faed2f8b81107f1a048fb053248b5f23480..85da6f5b3907ad0d85b11f889f3804991724c2b0 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
@@ -28,7 +28,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp
index 30bf9249eccc5149db46af640f8ecfb58bdb04fc..8b11bdb17cf6415956cbc8ec23c9218e5274521a 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp
@@ -40,7 +40,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << *matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp
index 20d55ff1216e20a5c943d72919aa13e51e353240..073fbc9096c755d55247ea1bfc1d4e6db0ccd97b 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp
@@ -30,7 +30,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, view.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, view.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp
index fda9a41b995585d38b2a0067c1de3b3136578136..f6a54481adf3db4f70fb33db03dcc250f6ff693a 100644
--- a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp
+++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp
@@ -6,5 +6,5 @@ void scalarProduct( double* v1, double* v2, double* product, const int size )
     }
     auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) {
         return a + b; };
-    TNL::Algorithms::Reduction< Device >::reduce( 0, size, fetch, reduce, 0.0 );
+    TNL::Algorithms::reduce< Device >( 0, size, fetch, reduce, 0.0 );
 }
\ No newline at end of file
diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp
index ef17140ce0acdaf39ef772481a9b4728d638127e..7c7993e3a08ad099f685cd1ce0acf424dd6ae413 100644
--- a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp
+++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp
@@ -8,5 +8,5 @@ void scalarProduct( double* u1, double* u2,
     }
     auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) {
         return a + b; };
-    TNL::Algorithms::Reduction< Device >::reduce( 0, size, fetch, reduce, 0.0 );
+    TNL::Algorithms::reduce< Device >( 0, size, fetch, reduce, 0.0 );
 }
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
index 92686e17b10fa7ca0188aa0262e41252718f6f5a..594ebd8cd53b91eb12a871a5c70787992b9a0fb5 100644
--- a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
+++ b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
@@ -1,45 +1,49 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( SumExample SumExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND SumExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SumExample.out OUTPUT SumExample.out )
-   CUDA_ADD_EXECUTABLE( ProductExample ProductExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ProductExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ProductExample.out OUTPUT ProductExample.out )
-   CUDA_ADD_EXECUTABLE( ScalarProductExample ScalarProductExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ScalarProductExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ScalarProductExample.out OUTPUT ScalarProductExample.out )
-   CUDA_ADD_EXECUTABLE( MaximumNormExample MaximumNormExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MaximumNormExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MaximumNormExample.out OUTPUT MaximumNormExample.out )
-   CUDA_ADD_EXECUTABLE( ComparisonExample ComparisonExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ComparisonExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ComparisonExample.out OUTPUT ComparisonExample.out )
-   CUDA_ADD_EXECUTABLE( UpdateAndResidueExample UpdateAndResidueExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND UpdateAndResidueExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UpdateAndResidueExample.out OUTPUT UpdateAndResidueExample.out )
-   CUDA_ADD_EXECUTABLE( MapReduceExample-1 MapReduceExample-1.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-1 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-1.out OUTPUT MapReduceExample-1.out )
-   CUDA_ADD_EXECUTABLE( MapReduceExample-2 MapReduceExample-2.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-2.out OUTPUT MapReduceExample-2.out )
-   CUDA_ADD_EXECUTABLE( MapReduceExample-3 MapReduceExample-3.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-3.out OUTPUT MapReduceExample-3.out )
-   CUDA_ADD_EXECUTABLE( ReductionWithArgument ReductionWithArgument.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ReductionWithArgument > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ReductionWithArgument.out OUTPUT ReductionWithArgument.out )
-   CUDA_ADD_EXECUTABLE( ScanExample ScanExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ScanExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ScanExample.out OUTPUT ScanExample.out )
-   CUDA_ADD_EXECUTABLE( ExclusiveScanExample ExclusiveScanExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ExclusiveScanExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ExclusiveScanExample.out OUTPUT ExclusiveScanExample.out )
-   CUDA_ADD_EXECUTABLE( SegmentedScanExample SegmentedScanExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND SegmentedScanExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SegmentedScanExample.out OUTPUT SegmentedScanExample.out )
-ENDIF()
+# Examples built for the host and for CUDA; the output of each is stored as a documentation snippet.
+set( COMMON_EXAMPLES
+     SumExample
+     SumExampleWithFunctional
+     ProductExample
+     ScalarProductExample
+     ScalarProductWithFunctionalExample
+     MaximumNormExample
+     ComparisonExample
+     UpdateAndResidueExample
+     MapReduceExample-1
+     MapReduceExample-2
+     MapReduceExample-3
+     ReductionWithArgument
+     ReductionWithArgumentWithFunctional
+     ScanExample
+     ExclusiveScanExample
+     SegmentedScanExample
+)
+
+if( BUILD_CUDA )
+   foreach( target IN LISTS COMMON_EXAMPLES )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( OUTPUT ${target}.out COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out )
+      list( APPEND CUDA_OUTPUTS ${target}.out )
+   endforeach()
+   # LONG_EXAMPLES are compiled only; their output is not captured.
+   # NOTE(review): LONG_EXAMPLES is not set in this file - the loop is a no-op unless an enclosing scope defines it.
+   foreach( target IN LISTS LONG_EXAMPLES )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+   endforeach()
+else()
+   foreach( target IN LISTS COMMON_EXAMPLES )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( OUTPUT ${target}.out COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out )
+      list( APPEND HOST_OUTPUTS ${target}.out )
+   endforeach()
+   # Host counterpart of the compile-only LONG_EXAMPLES loop.
+   foreach( target IN LISTS LONG_EXAMPLES )
+      add_executable( ${target} ${target}.cpp )
+   endforeach()
+endif()
+
 
 IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( TutorialsReduction-cuda ALL DEPENDS
-   SumExample.out
-   ProductExample.out
-   ScalarProductExample.out
-   MaximumNormExample.out
-   ComparisonExample.out
-   UpdateAndResidueExample.out
-   MapReduceExample-1.out
-   MapReduceExample-2.out
-   MapReduceExample-3.out
-   ReductionWithArgument.out
-   ScanExample.out
-   ExclusiveScanExample.out
-   SegmentedScanExample.out )
-ENDIF()
+   ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )  # ALL: part of the default build
+ELSE()
+   ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples ALL DEPENDS ${HOST_OUTPUTS} )  # ALL: part of the default build
+ENDIF()
diff --git a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
index 931d07d2b532bdb3bce834a557377dcf98220296..8972af7f44813a92969cb8ca5e1925002a15274c 100644
--- a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
@@ -21,8 +21,8 @@ bool comparison( const Vector< double, Device >& u, const Vector< double, Device
    /***
     * Reduce performs logical AND on intermediate results obtained by fetch.
     */
-   auto reduce = [] __cuda_callable__ ( const bool& a, const bool& b ) { return a && b; };
-   return Reduction< Device >::reduce( 0, v_view.getSize(), fetch, reduce, true );
+   auto reduction = [] __cuda_callable__ ( const bool& a, const bool& b ) { return a && b; };
+   return reduce< Device >( 0, v_view.getSize(), fetch, reduction, true );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
index 90a069c8a9286b1bf7a6262d7ead7d60f2e7ae50..ff02f9c86f800b70e45f80690c1eb6b54c37da6f 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
@@ -13,8 +13,8 @@ double mapReduce( Vector< double, Device >& u )
    auto u_view = u.getView();
    auto fetch = [=] __cuda_callable__ ( int i )->double {
       return u_view[ i ] > 0 ? u_view[ i ] : 0.0; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, u_view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, u_view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
index da7c1c9c6cc8d690a8fec45ad43f54a51cbeab3b..065f4608ad1194a1c6867857567a2d67cef37bb7 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
@@ -15,8 +15,8 @@ double mapReduce( Vector< double, Device >& u )
    auto fetch = [=] __cuda_callable__ ( int i )->double {
       if( i % 2 == 0 ) return u_view[ i ];
       return 0.0; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, u_view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, u_view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
@@ -27,7 +27,7 @@ int main( int argc, char* argv[] )
    timer.start();
    double result = mapReduce( host_u );
    timer.stop();
-   std::cout << "Host tesult is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "Host result is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #ifdef HAVE_CUDA
    Vector< double, Devices::Cuda > cuda_u( 100000 );
    cuda_u = 1.0;
@@ -35,7 +35,7 @@ int main( int argc, char* argv[] )
    timer.start();
    result = mapReduce( cuda_u );
    timer.stop();
-   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #endif
    return EXIT_SUCCESS;
 }
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
index 5b5f31131cac0c90dcaaa783c80acd51018e711c..f3c54f6b0f66a9cbb353021a2858311c405ff1fd 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
@@ -14,8 +14,8 @@ double mapReduce( Vector< double, Device >& u )
    auto u_view = u.getView();
    auto fetch = [=] __cuda_callable__ ( int i )->double {
       return u_view[ 2 * i ]; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, u_view.getSize() / 2, fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, u_view.getSize() / 2, fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
@@ -26,7 +26,7 @@ int main( int argc, char* argv[] )
    timer.start();
    double result = mapReduce( host_u );
    timer.stop();
-   std::cout << "Host result is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "Host result is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #ifdef HAVE_CUDA
    Vector< double, Devices::Cuda > cuda_u( 100000 );
    cuda_u = 1.0;
@@ -34,7 +34,7 @@ int main( int argc, char* argv[] )
    timer.start();
    result = mapReduce( cuda_u );
    timer.stop();
-   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #endif
    return EXIT_SUCCESS;
 }
diff --git a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
index 8d503cbd4a7ed079dfff6b1d81511dc3ebc357bc..c9a5926ad741dd85f971af34374e45549b4b10a3 100644
--- a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
@@ -12,8 +12,8 @@ double maximumNorm( const Vector< double, Device >& v )
 {
    auto view = v.getConstView();
    auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return max( a, b ); };
-   return Reduction< Device >::reduce( 0, view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return max( a, b ); };
+   return reduce< Device >( 0, view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
index 9df9a6e4b533d9b1669d80802d3eb6a38944d274..389ecd4975f21a1fedbd994fb11c4564c3018b3a 100644
--- a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
@@ -12,12 +12,12 @@ double product( const Vector< double, Device >& v )
 {
    auto view = v.getConstView();
    auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a * b; };
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a * b; };
 
    /***
     * Since we compute the product of all elements, the reduction must be initialized by 1.0 not by 0.0.
     */
-   return Reduction< Device >::reduce( 0, view.getSize(), fetch, reduce, 1.0 );
+   return reduce< Device >( 0, view.getSize(), fetch, reduction, 1.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
index 18ac3363bf4632b07f35404d5d887c6ed7637e9c..79a82c7334270e5b4cc3360f09b7d2761bc0f65f 100644
--- a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
@@ -22,7 +22,7 @@ maximumNorm( const Vector< double, Device >& v )
       else if( a == b && bIdx < aIdx )
          aIdx = bIdx;
    };
-   return Reduction< Device >::reduceWithArgument( 0, view.getSize(), fetch, reduction, std::numeric_limits< double >::max() );
+   return reduceWithArgument< Device >( 0, view.getSize(), fetch, reduction, std::numeric_limits< double >::max() );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7b084db0eaa80805c6c54cdc7beb0828c7907715
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
@@ -0,0 +1,36 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Reduction.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+std::pair< double, int >
+maximumNorm( const Vector< double, Device >& v )
+{
+   // Reduces abs( v[ i ] ) over all indices with TNL::MaxWithArg and returns the resulting pair.
+   auto elements = v.getConstView();
+   return reduceWithArgument< Device >( 0, elements.getSize(),
+      [=] __cuda_callable__ ( int idx ) { return abs( elements[ idx ] ); }, TNL::MaxWithArg{} );
+}
+
+int main( int argc, char* argv[] )
+{
+   // Host run, then (with HAVE_CUDA) the same steps on the device: fill with idx - 7, print the vector, print the reduction result.
+   Vector< double, Devices::Host > host_v( 10 );
+   host_v.forAllElements( [] __cuda_callable__ ( int idx, double& element ) { element = idx - 7; } );
+   std::cout << "host_v = " << host_v << std::endl;
+   const auto hostResult = maximumNorm( host_v );
+   std::cout << "The maximum norm of the host vector elements is " <<  hostResult.first << " at position " << hostResult.second << "." << std::endl;
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( 10 );
+   cuda_v.forAllElements( [] __cuda_callable__ ( int idx, double& element ) { element = idx - 7; } );
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+   const auto cudaResult = maximumNorm( cuda_v );
+   std::cout << "The maximum norm of the device vector elements is " <<  cudaResult.first << " at position " << cudaResult.second << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cu b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cu
new file mode 120000
index 0000000000000000000000000000000000000000..a546b63396d2db82157b0e332177c2539ecbbead
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cu
@@ -0,0 +1 @@
+ReductionWithArgumentWithFunctional.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
index 680075f8426d8d7fe35292cbc18cc818cfbeb6d9..2dd84aa03e55159ecb130fbf51c004f68c49eb93 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
@@ -17,8 +17,8 @@ double scalarProduct( const Vector< double, Device >& u, const Vector< double, D
     * Fetch computes product of corresponding elements of both vectors.
     */
    auto fetch = [=] __cuda_callable__ ( int i ) { return u_view[ i ] * v_view[ i ]; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, v_view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, v_view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4838f5f77e98515b1b20845ea1f2ee0e626c9109
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Reduction.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+double scalarProduct( const Vector< double, Device >& u, const Vector< double, Device >& v )
+{
+   /***
+    * Fetch yields the product of corresponding elements of both vectors;
+    * reduce combines these values with TNL::Plus over the indices
+    * 0 .. rhs.getSize().
+    */
+   auto lhs = u.getConstView();
+   auto rhs = v.getConstView();
+
+   auto fetch = [=] __cuda_callable__ ( int idx ) { return lhs[ idx ] * rhs[ idx ]; };
+   return reduce< Device >( 0, rhs.getSize(), fetch, TNL::Plus{} );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Host run: host_u is all ones, host_v[ i ] = 2 * ( i % 2 ) - 1.
+    */
+   Vector< double, Devices::Host > host_u( 10 ), host_v( 10 );
+   host_u = 1.0;
+   host_v.forAllElements( [] __cuda_callable__ ( int idx, double& element ) { element = 2 * ( idx % 2 ) - 1; } );
+   std::cout << "host_u = " << host_u << std::endl;
+   std::cout << "host_v = " << host_v << std::endl;
+   const double host_uv = scalarProduct( host_u, host_v );
+   const double host_vv = scalarProduct( host_v, host_v );
+   std::cout << "The scalar product ( host_u, host_v ) is " << host_uv << "." << std::endl;
+   std::cout << "The scalar product ( host_v, host_v ) is " << host_vv << "." << std::endl;
+#ifdef HAVE_CUDA
+   // The same computation with vectors allocated on the CUDA device.
+   Vector< double, Devices::Cuda > cuda_u( 10 ), cuda_v( 10 );
+   cuda_u = 1.0;
+   cuda_v.forAllElements( [] __cuda_callable__ ( int idx, double& element ) { element = 2 * ( idx % 2 ) - 1; } );
+   std::cout << "cuda_u = " << cuda_u << std::endl;
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+   const double cuda_uv = scalarProduct( cuda_u, cuda_v );
+   const double cuda_vv = scalarProduct( cuda_v, cuda_v );
+   std::cout << "The scalar product ( cuda_u, cuda_v ) is " << cuda_uv << "." << std::endl;
+   std::cout << "The scalar product ( cuda_v, cuda_v ) is " << cuda_vv << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cu b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cu
new file mode 120000
index 0000000000000000000000000000000000000000..8eef06256f09b944a15469371a62c4fa90765265
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cu
@@ -0,0 +1 @@
+ScalarProductWithFunctionalExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
index 3dbd8581d1932933e20f3011b226fe1f3ce9bcf6..5281bfd5460711944fd0bbcbdf867d679e8e7954 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
@@ -12,14 +12,14 @@ void scan( Vector< double, Device >& v )
    /***
     * Reduction is sum of two numbers.
     */
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
 
    /***
     * As parameters, we pass vector on which the scan is to be performed, interval
     * where the scan is performed, lambda function which is used by the scan and
     * zero element (idempotent) of the 'sum' operation.
     */
-   Scan< Device >::perform( v, 0, v.getSize(), reduce, 0.0 );
+   Scan< Device >::perform( v, 0, v.getSize(), reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
@@ -44,5 +44,4 @@ int main( int argc, char* argv[] )
    std::cout << "The prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
 #endif
    return EXIT_SUCCESS;
-}
-
+}
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp b/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp
index 0932b8b181fda7646937afb3102a5ea2601b0a2c..377040c76071f92fc7f2780f73c0cb7ba1f7ef0a 100644
--- a/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp
@@ -1,11 +1,11 @@
 double sequentialSum( const double* a, const int size )
 {
    auto fetch = [=] (int i)->double { return a[ i ]; };
-   auto reduce = [] (double& x, const double& y) { return x + y; };
+   auto reduction = [] (double& x, const double& y) { return x + y; };  // returns the sum of its two arguments
 
    double sum( 0.0 );
    for( int i = 0; i < size; i++ )
-       sum = reduce( sum, fetch( i ) );
+       sum = reduction( sum, fetch( i ) );  // fold the i-th fetched value into the running sum
    return sum;
 }
 
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
index 90c6f724a7106f18f9ea87f0eb9807c2d264c349..cfa6e1befd8c75322139d5dcdf9a6558caced9ce 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
@@ -23,14 +23,14 @@ double sum( const Vector< double, Device >& v )
    /***
     * Reduction is sum of two numbers.
     */
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
 
    /***
     * Finally we call the templated function Reduction and pass number of elements to reduce,
     * lambdas defined above and finally value of idempotent element, zero in this case, which serve for the
     * reduction initiation.
     */
-   return Reduction< Device >::reduce( 0, view.getSize(), fetch, reduce, 0.0 );
+   return reduce< Device >( 0, view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9ef7795cdd80e8ba70216028566b2a35cdaa04aa
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
@@ -0,0 +1,51 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Reduction.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+double sum( const Vector< double, Device >& v )
+{
+   /****
+    * Get a constant view of the vector; the view is what the lambda below captures by value.
+    */
+   auto view = v.getConstView();
+
+   /****
+    * The fetch function just reads the i-th element of vector v through the view.
+    */
+   auto fetch = [=] __cuda_callable__ ( int i ) -> double { return view[ i ]; };
+
+   /***
+    * Finally we call the template function `reduce` and pass the interval of indexes to reduce (from 0 to
+    * the size of the view), the lambda defined above and the functional representing the reduction operation.
+    */
+   return reduce< Device >( 0, view.getSize(), fetch, TNL::Plus{} );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * First, test the sum with a vector allocated on the host (CPU).
+    */
+   Vector< double, Devices::Host > host_v( 10 );
+   host_v = 1.0;
+   std::cout << "host_v = " << host_v << std::endl;
+   std::cout << "The sum of the host vector elements is " << sum( host_v ) << "." << std::endl;
+
+   /***
+    * Then repeat the same test on the GPU; this part is compiled only when HAVE_CUDA is defined.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( 10 );
+   cuda_v = 1.0;
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+   std::cout << "The sum of the CUDA vector elements is " << sum( cuda_v ) << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cu b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cu
new file mode 120000
index 0000000000000000000000000000000000000000..c95dde139185a5740dc27370f2672beccb9be253
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cu
@@ -0,0 +1 @@
+SumExampleWithFunctional.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
index 8bd08e900dcf9dfb0924e3665ac0211037fa135f..a2ccb8189993580c35cc31e5350b503d0bf4f7f4 100644
--- a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
@@ -16,8 +16,8 @@ double updateAndResidue( Vector< double, Device >& u, const Vector< double, Devi
       const double& add = delta_u_view[ i ];
       u_view[ i ] += tau * add;
       return add * add; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return sqrt( Reduction< Device >::reduce( 0, u_view.getSize(), fetch, reduce, 0.0 ) );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return sqrt( reduce< Device >( 0, u_view.getSize(), fetch, reduction, 0.0 ) );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index 32d663bd3e7bbdf4f79eea2b0992adc89473fccd..35246fe4ef75bcf8af3fc3528ecd711b878f5321 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -14,11 +14,11 @@ We will explain the *flexible parallel reduction* on several examples. We start
 
 We start with simple problem of computing sum of sequence of numbers \f[ s = \sum_{i=1}^n a_i. \f] Sequentialy, such sum can be computed very easily as follows:
 
-\include SequentialSum.cpp
+\includelineno SequentialSum.cpp
 
 Doing the same in CUDA for GPU is, however, much more difficult (see. [Optimizing Parallel Reduction in CUDA](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf)). The final code has tens of lines and it is something you do not want to write again and again anytime you need to sum a series of numbers. Using TNL and C++ lambda functions we may do the same on few lines of code efficiently and independently on the hardware beneath. Let us first rewrite the previous example using the C++ lambda functions:
 
-\include SequentialSumWithLambdas.cpp
+\includelineno SequentialSumWithLambdas.cpp
 
 As can be seen, we split the reduction into two steps:
 
@@ -26,15 +26,15 @@ As can be seen, we split the reduction into two steps:
    1. Connect the reduction algorithm with given input arrays or vectors (or any other data structure).
    2. Perform operation you need to do with the input data.
    3. Perform another secondary operation simoultanously with the parallel reduction.
-2. `reduce` is operation we want to do after the data fetch. Usually it is summation, multiplication, evaluation of minimum or maximum or some logical operation.
+2. `reduction` is the operation we want to perform after the data fetch. Usually it is summation, multiplication, evaluation of minimum or maximum or some logical operation.
 
 Putting everything together gives the following example:
 
-\include SumExample.cpp
+\includelineno SumExample.cpp
 
 Since TNL vectors cannot be pass to CUDA kernels and so they cannot be captured by CUDA lambdas, we must first get vector view from the vector using a method `getConstView()`.
 
-Note tha we pass `0.0` as the last argument of the method `Reduction< Device >::reduce`. It is an *idempotent element* (see [Idempotence](https://cs.wikipedia.org/wiki/Idempotence)). It is an element which, for given operation, does not change the result. For addition, it is zero. The result looks as follows.
+Note that we pass `0.0` as the last argument of the template function `reduce< Device >`. It is an *idempotent element* (see [Idempotence](https://cs.wikipedia.org/wiki/Idempotence)). It is an element which, for given operation, does not change the result. For addition, it is zero. The result looks as follows.
 
 \include SumExample.out
 
@@ -42,9 +42,9 @@ Sum of vector elements can be also obtained as [`sum(v)`](../html/namespaceTNL.h
 
 ### Product
 
-To demonstrate the effect of the *idempotent element*, we will now compute product of all elements of the vector. The *idempotent element* is one for multiplication and we also need to replace `a+b` with `a*b` in the definition of `reduce`. We get the following code:
+To demonstrate the effect of the *idempotent element*, we will now compute the product of all elements of the vector. The *idempotent element* is one for multiplication and we also need to replace `a+b` with `a*b` in the definition of `reduction`. We get the following code:
 
-\include ProductExample.cpp
+\includelineno ProductExample.cpp
 
 leading to output like this:
 
@@ -56,7 +56,7 @@ Product of vector elements can be computed using fuction [`product(v)`](../html/
 
 One of the most important operation in the linear algebra is the scalar product of two vectors. Compared to coputing the sum of vector elements we must change the function `fetch` to read elements from both vectors and multiply them. See the following example.
 
-\include ScalarProductExample.cpp
+\includelineno ScalarProductExample.cpp
 
 The result is:
 
@@ -64,11 +64,11 @@ The result is:
 
 Scalar product of vectors `u` and `v` in TNL can be computed by \ref TNL::dot "TNL::dot(u, v)" or simply as \ref TNL::Containers::operator, "(u, v)".
 
-### Maxium norm
+### Maximum norm
 
-Maximum norm of a vector equals modulus of the vector largest element.  Therefore, `fetch` must return the absolute value of the vector elements and `reduce` wil return maximum of given values. Look at the following example.
+Maximum norm of a vector equals the modulus of the vector element with the largest absolute value.  Therefore, `fetch` must return the absolute value of the vector elements and `reduction` will return the maximum of given values. Look at the following example.
 
-\include MaximumNormExample.cpp
+\includelineno MaximumNormExample.cpp
 
 The output is:
 
@@ -78,9 +78,9 @@ Maximum norm in TNL is computed by the function \ref TNL::maxNorm.
 
 ### Vectors comparison
 
-Comparison of two vectors involve (parallel) reduction as well. The `fetch` part is responsible for comparison of corresponding vector elements result of which is boolean `true` or `false` for each vector elements. The `reduce` part must perform logical and operation on all of them. We must not forget to change the *idempotent element* to `true`. The code may look as follows:
+Comparison of two vectors involves (parallel) reduction as well. The `fetch` part is responsible for the comparison of corresponding vector elements, the result of which is boolean `true` or `false` for each pair of elements. The `reduction` part must perform the logical AND operation on all of them. We must not forget to change the *idempotent element* to `true`. The code may look as follows:
 
-\include ComparisonExample.cpp
+\includelineno ComparisonExample.cpp
 
 And the output looks as:
 
@@ -96,7 +96,7 @@ In iterative solvers we often need to update a vector and compute the update nor
 
 Together with the vector addition, we may want to compute also \f$L_2\f$-norm of \f$\Delta \bf u\f$ which may indicate convergence. Computing first the addition and then the norm would be inefficient because we would have to fetch the vector \f$\Delta \bf u\f$ twice from the memory. The following example shows how to do the addition and norm computation at the same time.
 
-\include UpdateAndResidueExample.cpp
+\includelineno UpdateAndResidueExample.cpp
 
 The result reads as:
 
@@ -112,7 +112,7 @@ return u_view[ i ] > 0.0 ? u_view[ i ] : 0.0;
 
 to sum up only the positive numbers in the vector.
 
-\include MapReduceExample-1.cpp
+\includelineno MapReduceExample-1.cpp
 
 The result is:
 
@@ -120,7 +120,7 @@ The result is:
 
 Take a look at the following example where the filtering depends on the element indexes rather than values:
 
-\include MapReduceExample-2.cpp
+\includelineno MapReduceExample-2.cpp
 
 The result is:
 
@@ -134,28 +134,54 @@ return u_view[ 2 * i ];
 
 See the following example and compare the execution times.
 
-\include MapReduceExample-3.cpp
+\includelineno MapReduceExample-3.cpp
 
 \include MapReduceExample-3.out
 
 ### Reduction with argument
 
-In some situations we may need to locate given element in the vector. For example index of the smallest or the largest element. `reductionWithArgument` is a function which can do it. In the following example, we modify function for computing the maximum norm of a vedctor. Instead of just computing the value, now we want to get index of the element having the absolute value equal to the max norm. The lambda function `reduction` do not compute only maximum of two given elements anymore, but it must also compute index of the winner. See the following code:
+In some situations we may need to locate given element in the vector, for example the index of the smallest or the largest element. `reduceWithArgument` is a function which can do it. In the following example, we modify the function for computing the maximum norm of a vector. Instead of just computing the value, now we want to get the index of the element having the absolute value equal to the max norm. The lambda function `reduction` does not compute only the maximum of two given elements anymore, but it must also compute the index of the winner. See the following code:
 
-\include ReductionWithArgument.cpp
+\includelineno ReductionWithArgument.cpp
 
 The definition of the lambda function `reduction` reads as:
 
 ```
-auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b );
+auto reduction = [] __cuda_callable__ ( double& a, const double& b, int& aIdx, const int& bIdx );
 ```
 
-In addition to vector elements valuesd `a` and `b`, it gets also their positions `aIdx` and `bIdx`. The functions is responsible to set `a` to maximum of the two and `aIdx` to the position of the larger element. Note, that the parameters have the above mentioned meaning only in case of computing minimum or maximum.
+In addition to the vector element values `a` and `b`, it also gets their positions `aIdx` and `bIdx`. The function is responsible for setting `a` to the maximum of the two and `aIdx` to the position of the larger element. Note that the parameters have the above mentioned meaning only in case of computing minimum or maximum.
 
 The result looks as:
 
 \include ReductionWithArgument.out
 
+### Using functionals for reduction
+
+You might notice that the lambda function `reduction` does not take as many different forms as `fetch`. In addition, setting the zero (or idempotent) element can be annoying, especially when computing minimum or maximum, where we need to consult `std::numeric_limits` to make the code work with any type. To make things simpler, TNL offers variants of several functionals known from STL. They can be used instead of the lambda function `reduction` and they also carry the idempotent element. See the following example showing the scalar product of two vectors, now with a functional:
+
+\includelineno ScalarProductWithFunctionalExample.cpp
+
+
+This example also shows a more compact way to invoke the function `reduce` (lines 19-22). This way, one should be able to perform (parallel) reduction very easily. The result looks as follows:
+
+\include ScalarProductWithFunctionalExample.out
+
+In \ref TNL/Functionals.h you may find probably all operations that can be reasonably used for reduction:
+
+| Functional                      | Reduction operation      |
+|---------------------------------|--------------------------|
+| \ref TNL::Plus                  | Sum                      |
+| \ref TNL::Multiplies            | Product                  |
+| \ref TNL::Min                   | Minimum                  |
+| \ref TNL::Max                   | Maximum                  |
+| \ref TNL::MinWithArg            | Minimum with argument    |
+| \ref TNL::MaxWithArg            | Maximum with argument    |
+| \ref TNL::LogicalAnd            | Logical AND              |
+| \ref TNL::LogicalOr             | Logical OR               |
+| \ref TNL::BitAnd                | Bit AND                  |
+| \ref TNL::BitOr                 | Bit OR                   |
+
 ## Flexible scan
 
 ### Inclusive and exclusive scan
@@ -192,7 +218,7 @@ and exclusive prefix sum of the same sequence is
 
 Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only on sumation, however product or logical operations could be handy as well. In TNL, prefix sum is implemented in simillar way as reduction and so it can be easily modified by lambda functions. The following example shows how it works:
 
-\include ScanExample.cpp
+\includelineno ScanExample.cpp
 
 Scan does not use `fetch` function because the scan must be performed on a vector (the first parameter we pass to the scan). Its complexity is also higher compared to reduction. Thus if one needs to do some operation with the vector elements before the scan, this can be done explicitly and it will not affect the performance significantlty. On the other hand, the scan function takes interval of the vector elements where the scan is performed as its second and third argument. The next argument is the operation to be performed by the scan and the last parameter is the idempotent ("zero") element if the operation.
 
@@ -203,12 +229,12 @@ The result looks as:
 Exclusive scan works the same way, we just need to specify it by the second template parameter which is set to `ScanType::Exclusive`. The call of the scan then looks as
 
 ```
-Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduce, 0.0 );
+Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduction, 0.0 );
 ```
 
 The complete example looks as follows:
 
-\include ExclusiveScanExample.cpp
+\includelineno ExclusiveScanExample.cpp
 
 And the result looks as:
 
@@ -242,8 +268,8 @@ In addition to common scan, we need to encode the segments of the input sequence
 ```
 **Note: Segmented scan is not implemented for CUDA yet.**
 
-\include SegmentedScanExample.cpp
+\includelineno SegmentedScanExample.cpp
 
 The result reads as:
 
-\include SegmentedScanExample.out
+\include SegmentedScanExample.out
\ No newline at end of file
diff --git a/Documentation/Tutorials/Vectors/Expressions.cpp b/Documentation/Tutorials/Vectors/Expressions.cpp
index b41d2188eadf3ccbdd8317634669f962d47bf5a9..b261b4470c28be3c06fa0803b4bb51ee8982a035 100644
--- a/Documentation/Tutorials/Vectors/Expressions.cpp
+++ b/Documentation/Tutorials/Vectors/Expressions.cpp
@@ -10,16 +10,12 @@ void expressions()
 {
    using RealType = float;
    using VectorType = Vector< RealType, Device >;
-   using ViewType = VectorView< RealType, Device >;
 
    /****
     * Create vectors
     */
    const int size = 11;
-   VectorType a_v( size ), b_v( size ), c_v( size );
-   ViewType a = a_v.getView();
-   ViewType b = b_v.getView();
-   ViewType c = c_v.getView();
+   VectorType a( size ), b( size ), c( size );
    a.forAllElements( [] __cuda_callable__ ( int i, RealType& value ) { value = 3.14 * ( i - 5.0 ) / 5.0; } );
    b = a * a;
    c = 3 * a + sign( a ) * sin( a );
diff --git a/Documentation/Tutorials/Vectors/tutorial_Vectors.md b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
index 5ac66ccd74830db226f30c28bd0591e0c0226925..5cd3b60f1ad27a335edbc29288fb2988dccb9ef5 100644
--- a/Documentation/Tutorials/Vectors/tutorial_Vectors.md
+++ b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
@@ -20,24 +20,73 @@ This tutorial introduces vectors in TNL. `Vector`, in addition to `Array`, offer
 
 By *horizontal* operations we mean vector expressions where we have one or more vectors as an input and a vector as an output. In TNL, this kind of operations is performed by the [Expression Templates](https://en.wikipedia.org/wiki/Expression_templates). It makes algebraic operations with vectors easy to do and very efficient at the same time. In some cases, one get even more efficient code compared to [Blas](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [cuBlas](https://developer.nvidia.com/cublas). See the following example.
 
-\include Expressions.cpp
+\includelineno Expressions.cpp
 
 Output is:
 
 \include Expressions.out
 
-Vector expressions work only with `VectorView` not with `Vector`. The expression is evaluated on the same device where the vectors are allocated, this is done automatically. One cannot, however, mix vectors from different devices in one expression. Vector expression may contain any common function like `min`, `max`, `abs`, `sin`, `cos`, `exp`, `log`, `sqrt`, `pow` etc.
+The expression is evaluated automatically on the same device where the vectors are allocated. One cannot, however, mix vectors from different devices in one expression. Vector expressions may contain common functions such as the following:
+
+| Expression                     | Meaning                                                     |
+|--------------------------------|-------------------------------------------------------------|
+| `v = TNL::min( expr1, expr2 )` |  `v[ i ] = min( expr1[ i ], expr2[ i ] )`                   |
+| `v = TNL::max( expr1, expr2 )` |  `v[ i ] = max( expr1[ i ], expr2[ i ] )`                   |
+| `v =  TNL::abs( expr )`        |  `v[ i ] = abs( expr[ i ] )`                                |
+| `v =  TNL::sin( expr )`        |  `v[ i ] = sin( expr[ i ] )`                                |
+| `v =  TNL::cos( expr )`        |  `v[ i ] = cos( expr[ i ] )`                                |
+| `v =  TNL::tan( expr )`        |  `v[ i ] = tan( expr[ i ] )`                                |
+| `v =  TNL::asin( expr )`       |  `v[ i ] = asin( expr[ i ] )`                               |
+| `v =  TNL::acos( expr )`       |  `v[ i ] = acos( expr[ i ] )`                               |
+| `v =  TNL::atan( expr )`       |  `v[ i ] = atan( expr[ i ] )`                               |
+| `v =  TNL::sinh( expr )`       |  `v[ i ] = sinh( expr[ i ] )`                               |
+| `v =  TNL::cosh( expr )`       |  `v[ i ] = cosh( expr[ i ] )`                               |
+| `v =  TNL::tanh( expr )`       |  `v[ i ] = tanh( expr[ i ] )`                               |
+| `v =  TNL::asinh( expr )`      |  `v[ i ] = asinh( expr[ i ] )`                              |
+| `v =  TNL::acosh( expr )`      |  `v[ i ] = acosh( expr[ i ] )`                              |
+| `v =  TNL::atanh( expr )`      |  `v[ i ] = atanh( expr[ i ] )`                              |
+| `v =  TNL::exp( expr )`        |  `v[ i ] = exp( expr[ i ] )`                                |
+| `v =  TNL::log( expr )`        |  `v[ i ] = log( expr[ i ] )`                                |
+| `v =  TNL::log10( expr )`      |  `v[ i ] = log10( expr[ i ] )`                              |
+| `v =  TNL::log2( expr )`       |  `v[ i ] = log2( expr[ i ] )`                               |
+| `v =  TNL::sqrt( expr )`       |  `v[ i ] = sqrt( expr[ i ] )`                               |
+| `v =  TNL::cbrt( expr )`       |  `v[ i ] = cbrt( expr[ i ] )`                               |
+| `v =  TNL::pow( expr, p )`     |  `v[ i ] = pow( expr[ i ], p )`                             |
+| `v =  TNL::floor( expr )`      |  `v[ i ] = floor( expr[ i ] )`                              |
+| `v =  TNL::ceil( expr )`       |  `v[ i ] = ceil( expr[ i ] )`                               |
+| `v =  TNL::sign( expr )`       |  `v[ i ] = sign( expr[ i ] )`                               |
+
+Where `v` is a result vector and `expr`, `expr1` and `expr2` are vector expressions. Vector expressions can be combined with vector views (\ref TNL::Containers::VectorView) as well.
 
 ### Vertical operations
 
 By *vertical operations* we mean (parallel) reduction based operations where we have one vector expressions as an input and one value as an output. For example computing scalar product, vector norm or finding minimum or maximum of vector elements is based on reduction. See the following example.
 
-\include Reduction.cpp
+\includelineno Reduction.cpp
 
 Output is:
 
 \include Reduction.out
 
+The following table shows vertical operations that can be used on vector expressions:
+
+| Expression                                   | Meaning                                                                                            |
+|----------------------------------------------|----------------------------------------------------------------------------------------------------|
+| `v =  TNL::min( expr )`                      | `v` is minimum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                       |
+| `std::pair( v, i ) =  TNL::argMin( expr )`   | `v` is minimum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`, `i` is index of the smallest element. |
+| `v =  TNL::max( expr )`                      | `v` is maximum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                       |
+| `std::pair( v, i ) =  TNL::argMax( expr )`   | `v` is maximum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`, `i` is index of the largest element.  |
+| `v =  TNL::sum( expr )`                      | `v` is sum of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                          |
+| `v =  TNL::maxNorm( expr )`                  | `v` is maximum norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                 |
+| `v =  TNL::l1Norm( expr )`                   | `v` is l1 norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::l2Norm( expr )`                   | `v` is l2 norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::lpNorm( expr, p )`                | `v` is lp norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::product( expr )`                  | `v` is product of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::logicalAnd( expr )`               | `v` is logical AND of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                  |
+| `v =  TNL::logicalOr( expr )`                | `v` is logical OR of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                   |
+| `v =  TNL::binaryAnd( expr )`                | `v` is binary AND of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                   |
+| `v =  TNL::binaryOr( expr )`                 | `v` is binary OR of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                    |
+
 ## Static vectors
 
 Static vectors are derived from static arrays and so they are allocated on the stack and can be created in CUDA kernels as well. Their size is fixed as well and it is given by a template parameter. Static vector is a templated class defined in namespace `TNL::Containers` having two template parameters:
diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
index d6a459677deec7e2a78cf3bbf2e12a1e8c46ecd9..72c1f344dcc843344600eaefaac9bd35a7d1f010 100644
--- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp
+++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
@@ -30,7 +30,7 @@ getVectorMax( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -47,7 +47,7 @@ getVectorMin( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -64,7 +64,7 @@ getVectorAbsMax( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -81,7 +81,7 @@ getVectorAbsMin( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -97,7 +97,7 @@ getVectorL1Norm( const Vector& v )
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -113,7 +113,7 @@ getVectorL2Norm( const Vector& v )
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
-   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
+   return std::sqrt( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
 }
 
 template< typename Device >
@@ -136,7 +136,7 @@ getVectorLpNorm( const Vector& v,
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
-   return std::pow( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
 }
 
 template< typename Device >
@@ -155,7 +155,7 @@ getVectorSum( const Vector& v )
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -175,7 +175,7 @@ getVectorDifferenceMax( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -195,7 +195,7 @@ getVectorDifferenceMin( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -215,7 +215,7 @@ getVectorDifferenceAbsMax( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -235,7 +235,7 @@ getVectorDifferenceAbsMin( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -254,7 +254,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -276,7 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
       auto diff = data1[ i ] - data2[ i ];
       return diff * diff;
    };
-   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
+   return std::sqrt( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
 }
 
 template< typename Device >
@@ -302,7 +302,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
-   return std::pow( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
 }
 
 template< typename Device >
@@ -321,7 +321,7 @@ getVectorDifferenceSum( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -340,7 +340,7 @@ getScalarProduct( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 } // namespace Benchmarks
diff --git a/src/Examples/flow-sw/navierStokesProblem_impl.h b/src/Examples/flow-sw/navierStokesProblem_impl.h
index e42c80894ee62126104c4e17bc872212f8c4bde7..45eab8db9debe35d208c59e0b6051074bd88cb32 100644
--- a/src/Examples/flow-sw/navierStokesProblem_impl.h
+++ b/src/Examples/flow-sw/navierStokesProblem_impl.h
@@ -177,7 +177,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -194,7 +197,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/flow-vl/navierStokesProblem_impl.h b/src/Examples/flow-vl/navierStokesProblem_impl.h
index e42c80894ee62126104c4e17bc872212f8c4bde7..45eab8db9debe35d208c59e0b6051074bd88cb32 100644
--- a/src/Examples/flow-vl/navierStokesProblem_impl.h
+++ b/src/Examples/flow-vl/navierStokesProblem_impl.h
@@ -177,7 +177,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -194,7 +197,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/flow/navierStokesProblem_impl.h b/src/Examples/flow/navierStokesProblem_impl.h
index c4c5795c8eb38f28ccd91f12f73b21b322dc2cb2..69c226bad5286764252582e52b4e18b04068f582 100644
--- a/src/Examples/flow/navierStokesProblem_impl.h
+++ b/src/Examples/flow/navierStokesProblem_impl.h
@@ -189,7 +189,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+   /*FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -206,7 +209,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/inviscid-flow-sw/eulerProblem_impl.h b/src/Examples/inviscid-flow-sw/eulerProblem_impl.h
index f56fb295a9406b719e1b5d72cffc904c98bbe93e..6195a18de590ac0c636b07923a38624ffbe9e5b0 100644
--- a/src/Examples/inviscid-flow-sw/eulerProblem_impl.h
+++ b/src/Examples/inviscid-flow-sw/eulerProblem_impl.h
@@ -174,7 +174,11 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+   
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -191,7 +195,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/inviscid-flow-vl/eulerProblem_impl.h b/src/Examples/inviscid-flow-vl/eulerProblem_impl.h
index f56fb295a9406b719e1b5d72cffc904c98bbe93e..e921eaab232287351073ad90c369024bd2e61f4b 100644
--- a/src/Examples/inviscid-flow-vl/eulerProblem_impl.h
+++ b/src/Examples/inviscid-flow-vl/eulerProblem_impl.h
@@ -174,7 +174,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+   /*FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -191,7 +194,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/inviscid-flow/eulerProblem_impl.h b/src/Examples/inviscid-flow/eulerProblem_impl.h
index 5a7a42d1e9b3964b022a78bb89daa1b89f2a7d6d..50bcfbaecc650df09f59e7adb112dedb99692210 100644
--- a/src/Examples/inviscid-flow/eulerProblem_impl.h
+++ b/src/Examples/inviscid-flow/eulerProblem_impl.h
@@ -175,7 +175,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+  TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -189,7 +192,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "energy-" );
    this->conservativeVariables->getEnergy()->write( "energy", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index 545192dfaf21bcb2baa70812e6f5a86744b1b00b..626847eba38d4a944796a5989b870c7a44515d43 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -182,7 +182,7 @@ compare( const Element1* destination,
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
 
    auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; };
-   return Reduction< Devices::Cuda >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
 }
 
 template< typename Element,
@@ -198,7 +198,7 @@ containsValue( const Element* data,
    TNL_ASSERT_GE( size, (Index) 0, "" );
 
    auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return Reduction< Devices::Cuda >::reduce( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
 }
 
 template< typename Element,
@@ -214,7 +214,7 @@ containsOnlyValue( const Element* data,
    TNL_ASSERT_GE( size, 0, "" );
 
    auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return Reduction< Devices::Cuda >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
 }
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/MemoryOperationsHost.hpp b/src/TNL/Algorithms/MemoryOperationsHost.hpp
index 0034b8302c98cf1b657b76941f6dd05935161dd0..abebd9d156b8d2bbd27c566d65fe799f8a040b8c 100644
--- a/src/TNL/Algorithms/MemoryOperationsHost.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsHost.hpp
@@ -161,7 +161,7 @@ compare( const DestinationElement* destination,
 
    if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
       auto fetch = [destination, source] ( Index i ) -> bool { return destination[ i ] == source[ i ]; };
-      return Reduction< Devices::Host >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
    }
    else {
       // sequential algorithm can return as soon as it finds a mismatch
@@ -183,7 +183,7 @@ containsValue( const Element* data,
 
    if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
       auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; };
-      return Reduction< Devices::Host >::reduce( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
    }
    else {
       // sequential algorithm can return as soon as it finds a match
@@ -205,7 +205,7 @@ containsOnlyValue( const Element* data,
 
    if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
       auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; };
-      return Reduction< Devices::Host >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
    }
    else {
       // sequential algorithm can return as soon as it finds a mismatch
diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index d928ec6875e6a39bb855ae29961ef89d6b358b89..ad982258861f7b11c8d4983d6db8b4636f96c8cf 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -13,14 +13,16 @@
 #pragma once
 
 #include <utility>  // std::pair
-#include <functional>  // reduction functions like std::plus, std::logical_and, std::logical_or etc.
+#include <functional>        // reduction functions like std::plus, std::logical_and, std::logical_or etc.
 
+#include <TNL/Functional.h>  // modifications of STL functionals made more suitable for reduction
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
+#include <TNL/Algorithms/detail/Reduction.h>
 
 namespace TNL {
-namespace Algorithms {
+   namespace Algorithms {
 
 /**
  * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
@@ -31,322 +33,232 @@ namespace Algorithms {
  * position of the smallest or the largest element, reduction with argument can be used.
  *
  * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ * \tparam Reduce is a lambda function performing the reduction.
  *
- * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >.
+ * \e Device can be one of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a lambda function defining the reduction operation.
+ * \param zero is the idempotent element for the reduction operation, i.e. element which
+ *             does not change the result of the reduction.
+ * \return result of the reduction
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/SumExampleWithLambda.cpp
+ *
+ * \par Output
+ *
+ * \include SumExampleWithLambda.out
  */
-template< typename Device >
-struct Reduction;
-
-template<>
-struct Reduction< Devices::Sequential >
+template< typename Device,
+          typename Index,
+          typename Result,
+          typename Fetch,
+          typename Reduce >
+Result reduce( const Index begin,
+               const Index end,
+               Fetch&& fetch,
+               Reduce&& reduce,
+               const Result& zero )
 {
-   /**
-    * \brief Computes reduction on CPU sequentialy.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SumExample.cpp
-    *
-    * \par Output
-    *
-    * \include SumExample.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static constexpr Result
-   reduce( const Index begin,
-           const Index end,
-           Fetch&& fetch,
-           Reduce&& reduce,
-           const Result& zero );
+    return detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ), zero );
+}
 
-   /**
-    * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ReductionWithArgument.cpp
-    *
-    * \par Output
-    *
-    * \include ReductionWithArgument.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static constexpr std::pair< Result, Index >
-   reduceWithArgument( const Index begin,
-                       const Index end,
-                       Fetch&& fetch,
-                       Reduce&& reduce,
-                       const Result& zero );
-};
-
-template<>
-struct Reduction< Devices::Host >
+/**
+ * \brief Variant of \ref TNL::Algorithms::reduce with functional instead of reduction lambda function.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ * \tparam Reduce is a functional performing the reduction.
+ *
+ * \e Device can be one of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \e Reduce can be one of the following \ref TNL::Plus, \ref TNL::Multiplies, \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd,
+ *    \ref TNL::LogicalOr, \ref TNL::BitAnd or \ref TNL::BitOr.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a functional defining the reduction operation.
+ * \return result of the reduction
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/SumExampleWithFunctional.cpp
+ *
+ * \par Output
+ *
+ * \include SumExampleWithFunctional.out
+ */
+template< typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduce >
+auto reduce( const Index begin,
+             const Index end,
+             Fetch&& fetch,
+             Reduce&& reduce ) -> decltype( detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ),
+                                                   std::remove_reference< Reduce >::type::template getIdempotent< decltype( fetch( ( Index ) 0 ) ) >() ) )
 {
-   /**
-    * \brief Computes reduction on CPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SumExample.cpp
-    *
-    * \par Output
-    *
-    * \include SumExample.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static Result
-   reduce( const Index begin,
-           const Index end,
-           Fetch&& fetch,
-           Reduce&& reduce,
-           const Result& zero );
+   using Result = decltype( fetch( ( Index ) 0 ) );
+   return detail::Reduction< Device >::reduce( begin,
+                                               end,
+                                               std::forward< Fetch >( fetch ),
+                                               std::forward< Reduce >( reduce ),
+                                               reduce.template getIdempotent< Result >() );
+}
 
-   /**
-    * \brief Computes reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam ReductionOperation is a lambda function performing the reduction.
-    * \tparam DataFetcher is a lambda function for fetching the input data.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ReductionWithArgument.cpp
-    *
-    * \par Output
-    *
-    * \include ReductionWithArgument.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static std::pair< Result, Index >
-   reduceWithArgument( const Index begin,
-                       const Index end,
-                       Fetch&& fetch,
-                       Reduce&& reduce,
-                       const Result& zero );
-};
-
-template<>
-struct Reduction< Devices::Cuda >
+/**
+ * \brief Variant of \ref TNL::Algorithms::reduce returning also a position of an element of interest.
+ *
+ * For example in case of computing minimal or maximal element in array/vector,
+ * the position of the element having given value can be obtained. The use of this method
+ * is, however, more flexible.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Reduce is a lambda function performing the reduction.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ *
+ * \e Device can be one of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+ * \param zero is the idempotent element for the reduction operation, i.e. element which
+ *             does not change the result of the reduction.
+ * \return result of the reduction in a form of std::pair< Result, Index > structure. `pair.first`
+ *         is the reduction result and `pair.second` is the element position.
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/ReductionWithArgument.cpp
+ *
+ * \par Output
+ *
+ * \include ReductionWithArgument.out
+ */
+template< typename Device,
+          typename Index,
+          typename Result,
+          typename Fetch,
+          typename Reduce >
+std::pair< Result, Index >
+reduceWithArgument( const Index begin,
+                    const Index end,
+                    Fetch&& fetch,
+                    Reduce&& reduce,
+                    const Result& zero )
 {
-   /**
-    * \brief Computes reduction on GPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SumExample.cpp
-    *
-    * \par Output
-    *
-    * \include SumExample.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static Result
-   reduce( const Index begin,
-           const Index end,
-           Fetch&& fetch,
-           Reduce&& reduce,
-           const Result& zero );
+    return detail::Reduction< Device >::reduceWithArgument( begin,
+                                                            end,
+                                                            std::forward< Fetch >( fetch ),
+                                                            std::forward< Reduce >( reduce ),
+                                                            zero );
+}
 
-   /**
-    * \brief Computes reduction on GPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ReductionWithArgument.cpp
-    *
-    * \par Output
-    *
-    * \include ReductionWithArgument.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static std::pair< Result, Index >
-   reduceWithArgument( const Index begin,
-                       const Index end,
-                       Fetch&& fetch,
-                       Reduce&& reduce,
-                       const Result& zero );
-};
+/**
+ * \brief Variant of \ref TNL::Algorithms::reduceWithArgument with functional instead of reduction lambda function.
+ *
+ * For example in case of computing minimal or maximal element in array/vector,
+ * the position of the element having given value can be obtained. The use of this method
+ * is, however, more flexible.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Reduce is a functional performing the reduction.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ *
+ * \e Device can be one of the following: \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \e Reduce can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a functional defining the reduction operation and managing the elements positions.
+ * \param zero is the idempotent element for the reduction operation, i.e. element which
+ *             does not change the result of the reduction.
+ * \return result of the reduction in a form of std::pair< Result, Index > structure. `pair.first`
+ *         is the reduction result and `pair.second` is the element position.
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
+ *
+ * \par Output
+ *
+ * \include ReductionWithArgumentWithFunctional.out
+ */
+template< typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduce >
+auto
+reduceWithArgument( const Index begin,
+                    const Index end,
+                    Fetch&& fetch,
+                    Reduce&& reduce ) -> decltype( detail::Reduction< Device >::reduceWithArgument( begin, end, fetch, reduce,
+                                                   std::remove_reference< Reduce >::type::template getIdempotent<  decltype( fetch( ( Index ) 0 ) ) >() ) )
+{
+   using Result = decltype( fetch( ( Index ) 0 ) );
+   return detail::Reduction< Device >::reduceWithArgument( begin,
+                                                           end,
+                                                           std::forward< Fetch >( fetch ),
+                                                           std::forward< Reduce >( reduce ),
+                                                           reduce.template getIdempotent< Result >() );
+}
 
-} // namespace Algorithms
+   } // namespace Algorithms
 } // namespace TNL
-
-#include <TNL/Algorithms/Reduction.hpp>
diff --git a/src/TNL/Algorithms/detail/Reduction.h b/src/TNL/Algorithms/detail/Reduction.h
new file mode 100644
index 0000000000000000000000000000000000000000..998fa7ff5684ea99d512a6c7906d70bd5fb5f6c2
--- /dev/null
+++ b/src/TNL/Algorithms/detail/Reduction.h
@@ -0,0 +1,244 @@
+/***************************************************************************
+                          Reduction.h  -  description
+                             -------------------
+    begin                : Jul 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>            // std::pair
+#include <functional>         // reduction functions like std::plus, std::logical_and, std::logical_or etc. - deprecated
+
+#include <TNL/Functional.h>   // replacement of STL functional
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace detail {
+
+/**
+ * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
+ *
+ * Reduction can be used for operations having one or more vectors (or arrays) elements as input and returning
+ * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
+ * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
+ * position of the smallest or the largest element, reduction with argument can be used.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ *
+ * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >.
+ */
+template< typename Device >
+struct Reduction;
+
+template<>
+struct Reduction< Devices::Sequential >
+{
+   using DeviceType = Devices::Sequential;
+
+   /**
+    * \brief Computes reduction on CPU sequentially.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction
+    *
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static constexpr Result
+   reduce( const Index begin,
+           const Index end,
+           Fetch&& fetch,
+           Reduce&& reduce,
+           const Result& zero );
+
+   /**
+    * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
+    *
+    * For example in case of computing minimal or maximal element in array/vector,
+    * the position of the element having given value can be obtained. The use of this method
+    * is, however, more flexible.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction in a form of std::pair< Result, Index > structure. `pair.first`
+    *         is the reduction result and `pair.second` is the element position.
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static constexpr std::pair< Result, Index >
+   reduceWithArgument( const Index begin,
+                       const Index end,
+                       Fetch&& fetch,
+                       Reduce&& reduce,
+                       const Result& zero );
+};
+
+template<>
+struct Reduction< Devices::Host >
+{
+   using DeviceType = Devices::Host;
+
+   /**
+    * \brief Computes reduction on CPU.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction
+    *
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static Result
+   reduce( const Index begin,
+           const Index end,
+           Fetch&& fetch,
+           Reduce&& reduce,
+           const Result& zero );
+
+   /**
+    * \brief Computes reduction on CPU and returns position of an element of interest.
+    *
+    * For example in case of computing minimal or maximal element in array/vector,
+    * the position of the element having given value can be obtained. The use of this method
+    * is, however, more flexible.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction in a form of std::pair< Result, Index > structure. `pair.first`
+    *         is the reduction result and `pair.second` is the element position.
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static std::pair< Result, Index >
+   reduceWithArgument( const Index begin,
+                       const Index end,
+                       Fetch&& fetch,
+                       Reduce&& reduce,
+                       const Result& zero );
+};
+
+template<>
+struct Reduction< Devices::Cuda >
+{
+   using DeviceType = Devices::Cuda;
+
+   /**
+    * \brief Computes reduction on GPU.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static Result
+   reduce( const Index begin,
+           const Index end,
+           Fetch&& fetch,
+           Reduce&& reduce,
+           const Result& zero );
+
+   /**
+    * \brief Computes reduction on GPU and returns position of an element of interest.
+    *
+    * For example in case of computing minimal or maximal element in array/vector,
+    * the position of the element having given value can be obtained. The use of this method
+    * is, however, more flexible.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction in a form of std::pair< Result, Index > structure. `pair.first`
+    *         is the reduction result and `pair.second` is the element position.
+    *
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static std::pair< Result, Index >
+   reduceWithArgument( const Index begin,
+                       const Index end,
+                       Fetch&& fetch,
+                       Reduce&& reduce,
+                       const Result& zero );
+};
+
+      } // namespace detail
+   } // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/detail/Reduction.hpp>
diff --git a/src/TNL/Algorithms/Reduction.hpp b/src/TNL/Algorithms/detail/Reduction.hpp
similarity index 99%
rename from src/TNL/Algorithms/Reduction.hpp
rename to src/TNL/Algorithms/detail/Reduction.hpp
index d7602f9defd3da6e43c3c40291e19c7e676ad7b0..0d1c8231f02507a963652b11194c684a0d299088 100644
--- a/src/TNL/Algorithms/Reduction.hpp
+++ b/src/TNL/Algorithms/detail/Reduction.hpp
@@ -26,7 +26,8 @@
 #endif
 
 namespace TNL {
-namespace Algorithms {
+   namespace Algorithms {
+      namespace detail {
 
 /****
  * Arrays smaller than the following constant
@@ -499,5 +500,6 @@ reduceWithArgument( const Index begin,
    }
 }
 
-} // namespace Algorithms
+      } // namespace detail
+   } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 77c85c750c08779d3ac65abd538ee8094553e2f5..f2c9ca705465c56287f397bb1417777f6f12f4ae 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -966,6 +966,13 @@ File& operator>>( File& file, Array< Value, Device, Index, Allocator >& array );
 template< typename Value, typename Device, typename Index, typename Allocator >
 File& operator>>( File&& file, Array< Value, Device, Index, Allocator >& array );
 
+template< typename Value, typename Device, typename Index, typename Allocator >
+void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup() );
+
+template< typename Value, typename Device, typename Index, typename Allocator >
+void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag = 0, MPI_Comm comm = MPI::AllGroup() );
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index 53dd302efc83b7578b97e2034c52e75b460e76ff..d935840ff8d5524be68ff94fc70b674a903af7b3 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -911,5 +911,26 @@ File& operator>>( File&& file, Array< Value, Device, Index, Allocator >& array )
    return f >> array;
 }
 
+template< typename Value, typename Device, typename Index, typename Allocator >
+void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag, MPI_Comm comm )
+{
+   send( array.getConstView(), dest, tag, comm );
+}
+
+template< typename Value, typename Device, typename Index, typename Allocator >
+void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag, MPI_Comm comm )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( false, "Does not work" );
+   MPI_Status status;
+   Index size = 0;
+   MPI_Recv( ( void* ) &size, 1, MPI::getDataType< Index >(), src, tag, comm, &status );
+   // The element count is received first; resize the array before receiving the raw payload bytes.
+   array.setSize( size );
+   MPI_Recv( ( void* ) array.getData(), size * sizeof( Value ), MPI_BYTE, src, tag, comm, &status );
+#endif
+}
+
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index eb7e548b074ef26225de8382fa8345107e87e0b2..31743c1f60ac4f6ed243b06e1e856c21565bf11e 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -18,6 +18,7 @@
 #include <TNL/File.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
+#include <TNL/MPI.h>
 
 namespace TNL {
 namespace Containers {
@@ -767,6 +768,10 @@ File& operator>>( File& file, ArrayView< Value, Device, Index > view );
 template< typename Value, typename Device, typename Index >
 File& operator>>( File&& file, ArrayView< Value, Device, Index > view );
 
+template< typename Value, typename Device, typename Index >
+void send( const ArrayView< Value, Device, Index >& view, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup() );
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index 753e737e32ae691bcf56424e15646fd7805ee962..8f6b446fe4e28673f704afb01ef51bcecb7aabe6 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -389,7 +389,7 @@ reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce,
 
    ValueType* d = this->getData();
    auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( begin, end, main_fetch, reduce, zero );
+   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
 }
 
 template< typename Value,
@@ -407,7 +407,7 @@ reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce,
 
    const ValueType* d = this->getData();
    auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( begin, end, main_fetch, reduce, zero );
+   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
 }
 
 template< typename Value,
@@ -540,5 +540,17 @@ File& operator>>( File&& file, ArrayView< Value, Device, Index > view )
    return f >> view;
 }
 
+template< typename Value, typename Device, typename Index >
+void send( const ArrayView< Value, Device, Index >& view, int dest, int tag, MPI_Comm comm )
+{
+#ifdef HAVE_MPI
+   TNL_ASSERT_TRUE( false, "Does not work" );
+   const Index size = view.getSize();  // sent first, as a separate message, ahead of the payload
+   MPI_Send( ( const void* ) &size, 1, MPI::getDataType< Index >(), dest, tag, comm );
+   MPI_Send( ( const void* ) view.getData(), size * sizeof( Value ), MPI_BYTE, dest, tag, comm );
+#endif
+}
+
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 3947bfec438a31307b32241a4bebc9e6a4324ab7..15d8eaa53c98271d4a17019c000cfc70b14dd9ea 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -193,6 +193,67 @@ public:
    template< typename Array >
    bool operator!=( const Array& array ) const;
 
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end).
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f );
+
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end) for constant instances of the array.
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f ) const;
+
+
    // Checks if there is an element with given value in this array
    bool containsValue( ValueType value ) const;
 
@@ -215,6 +276,16 @@ private:
    static void setSynchronizerHelper( ViewType& view, const Array& array ) {}
 };
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+std::ostream& operator<<( std::ostream& str, const DistributedArray< Value, Device, Index, Allocator >& array )
+{
+   return array.getConstView().print( str );
+}
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index e9ee120932070bfb7cb57e1e65ecd38da1cd01ce..dcfaeee2d01929e7d339eda4d681a7907aaf0439 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -449,6 +449,30 @@ operator!=( const Array& array ) const
    return view != array;
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename Function >
+void
+DistributedArray< Value, Device, Index, Allocator >::
+forElements( IndexType begin, IndexType end, Function&& f )
+{
+   this->view.forElements( begin, end, f );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename Function >
+void
+DistributedArray< Value, Device, Index, Allocator >::
+forElements( IndexType begin, IndexType end, Function&& f ) const
+{
+   this->view.forElements( begin, end, f );
+}
+
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index cb3235ddbb746acf1149a697beaf49b16b39b1aa..9da306744f3e25f660b4fe80065308aae25b3b68 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -170,12 +170,73 @@ public:
    template< typename Array >
    bool operator!=( const Array& array ) const;
 
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end).
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f );
+
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end) for constant instances of the array.
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f ) const;
+
    // Checks if there is an element with given value in this array
    bool containsValue( ValueType value ) const;
 
    // Checks if all elements in this array have the same given value
    bool containsOnlyValue( ValueType value ) const;
 
+   std::ostream& print( std::ostream& str ) const;
 protected:
    LocalRangeType localRange;
    IndexType ghosts = 0;
@@ -187,6 +248,16 @@ protected:
    int valuesPerElement = 1;
 };
 
+// Stream insertion for DistributedArrayView: forwards to view.print( str ) and returns its result.
+template< typename Value,
+          typename Device = Devices::Host,
+          typename Index = int >
+std::ostream& operator<<( std::ostream& str, const DistributedArrayView< Value, Device, Index >& view )
+{
+   return view.print( str );
+}
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 65ecc4101fc0258bec0635aa07b202e83c9f178d..223ea99c869c36b10f858769440b6eb322075515 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -435,6 +435,37 @@ operator!=( const Array& array ) const
    return ! (*this == array);
 }
 
+// Calls f( globalIdx, value ) for every element of the global interval [ begin, end )
+// that falls into the local range of this view; other elements are not visited.
+template< typename Value, typename Device, typename Index >
+   template< typename Function >
+void
+DistributedArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f )
+{
+   // Keep the offset in a local: naming localRange inside the [=] lambda would capture
+   // `this`, i.e. a pointer to the host-side view, which a device kernel must not dereference.
+   const IndexType offset = localRange.getBegin();
+   const IndexType localBegin = max( begin, offset ) - offset;
+   const IndexType localEnd = min( end, localRange.getEnd() ) - offset;
+   auto local_f = [=] __cuda_callable__ ( const IndexType& idx, ValueType& value ) mutable {
+      f( idx + offset, value );
+   };
+   this->localData.forElements( localBegin, localEnd, local_f );
+}
+
+// Const overload: calls f( globalIdx, value ) for the elements of [ begin, end ) in the local range.
+template< typename Value, typename Device, typename Index >
+   template< typename Function >
+void
+DistributedArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f ) const
+{
+   const IndexType offset = localRange.getBegin();  // local copy so that the lambda does not capture `this`
+   auto local_f = [=] __cuda_callable__ ( const IndexType& idx, const ValueType& value ) mutable { f( idx + offset, value ); };
+   this->localData.forElements( max( begin, offset ) - offset, min( end, localRange.getEnd() ) - offset, local_f );
+}
+
 template< typename Value,
           typename Device,
           typename Index >
@@ -465,5 +496,31 @@ containsOnlyValue( ValueType value ) const
    return result;
 }
 
+template< typename Value,
+          typename Device,
+          typename Index >
+std::ostream&
+DistributedArrayView< Value, Device, Index >::
+print( std::ostream& str ) const
+{
+   // TODO: stub - nothing is written and str is returned as is; the disabled draft below does not work properly.
+   /*if( MPI::GetRank( group ) == 0 )
+   {
+      str << "[ ";
+      for( IndexType i = 0; i < localData.getSize(); i++ )
+         str << ", " << localData.getElement( i );
+      for( int proc = 1; proc < MPI::GetSize( group ); proc++ )
+      {
+         Array< std::remove_const_t< Value >, Device, Index > localArray;
+         receive( localArray, proc, 0, group );
+         for( IndexType i = 0; i < localArray.getSize(); i++ )
+            str << ", " << localArray.getElement( i );
+      }
+      str << " ]";
+   }
+   else send( this->localData, 0, 0, this->group );*/
+   return str;
+}
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h
index 738409cc40d94959599d3ad8f8f3c83bb6277bac..65f299120f180efcf2ff5897b1319fef473c1c3f 100644
--- a/src/TNL/Containers/Expressions/Comparison.h
+++ b/src/TNL/Containers/Expressions/Comparison.h
@@ -68,7 +68,7 @@ struct VectorComparison< T1, T2, false >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] == view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
@@ -100,7 +100,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] > view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool GE( const T1& a, const T2& b )
@@ -115,7 +115,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] >= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LT( const T1& a, const T2& b )
@@ -130,7 +130,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] < view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LE( const T1& a, const T2& b )
@@ -145,7 +145,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] <= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
@@ -162,7 +162,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool NE( const T1& a, const T2& b )
@@ -177,7 +177,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool GE( const T1& a, const T2& b )
@@ -187,7 +187,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LT( const T1& a, const T2& b )
@@ -197,7 +197,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LE( const T1& a, const T2& b )
@@ -207,7 +207,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
@@ -224,7 +224,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] == b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool NE( const T1& a, const T2& b )
@@ -239,7 +239,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] > b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool GE( const T1& a, const T2& b )
@@ -249,7 +249,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] >= b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LT( const T1& a, const T2& b )
@@ -259,7 +259,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] < b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LE( const T1& a, const T2& b )
@@ -269,7 +269,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] <= b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index 6959a95fed7ececd17b330f0c720b2d1d4dc0904..e257399f6670fd81d155d51312a76ecce8eeb38c 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -1073,7 +1073,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1092,7 +1092,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -1118,7 +1118,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1141,7 +1141,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -1167,7 +1167,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1190,7 +1190,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index f1b380435a2501ac74791e3d0b896675d4845a67..e1f850013a492bc071ae38cf031a381c1ae6d7e2 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -70,7 +70,7 @@ auto DistributedExpressionArgMin( const Expression& expression )
          else if( a == b && bIdx < aIdx )
             aIdx = bIdx;
       };
-      result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::max() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( (IndexType) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::max() );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
@@ -129,7 +129,7 @@ auto DistributedExpressionArgMax( const Expression& expression )
          else if( a == b && bIdx < aIdx )
             aIdx = bIdx;
       };
-      result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( ( IndexType ) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::lowest() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( ( IndexType ) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::lowest() );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 93d7e802d3cb627227156e5026a1404f7b57da7c..11b06e82269aed82ff76210d806e24f3341a1f5a 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -896,7 +896,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -915,7 +915,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -941,7 +941,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -964,7 +964,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -990,7 +990,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1013,7 +1013,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
index 6e5f5624b22934f4caeb22e303ab51ad98c27072..ff094e4ea9d29ef9eef3aee3461d0029662e5c35 100644
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ b/src/TNL/Containers/Expressions/VerticalOperations.h
@@ -43,7 +43,7 @@ auto ExpressionMin( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -65,7 +65,7 @@ auto ExpressionArgMin( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -85,7 +85,7 @@ auto ExpressionMax( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Expression >
@@ -107,7 +107,7 @@ auto ExpressionArgMax( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Expression >
@@ -119,7 +119,7 @@ auto ExpressionSum( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::plus<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::plus<>{}, (ResultType) 0 );
 }
 
 template< typename Expression >
@@ -131,7 +131,7 @@ auto ExpressionProduct( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::multiplies<>{}, (ResultType) 1 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::multiplies<>{}, (ResultType) 1 );
 }
 
 template< typename Expression >
@@ -145,7 +145,7 @@ auto ExpressionLogicalAnd( const Expression& expression )
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::logical_and<>{}, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_and<>{}, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -157,7 +157,7 @@ auto ExpressionLogicalOr( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::logical_or<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_or<>{}, (ResultType) 0 );
 }
 
 template< typename Expression >
@@ -171,7 +171,7 @@ auto ExpressionBinaryAnd( const Expression& expression )
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::bit_and<>{}, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_and<>{}, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -183,7 +183,7 @@ auto ExpressionBinaryOr( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::bit_or<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_or<>{}, (ResultType) 0 );
 }
 
 } // namespace Expressions
diff --git a/src/TNL/Containers/Subrange.h b/src/TNL/Containers/Subrange.h
index 17e02c45f96ff5be79bde0caf8692d25db75166e..9f95108fcc081706081b6e93774d3e449bf72d70 100644
--- a/src/TNL/Containers/Subrange.h
+++ b/src/TNL/Containers/Subrange.h
@@ -21,7 +21,7 @@
 namespace TNL {
 namespace Containers {
 
-// Specifies a subrange [begin, end) of a range [0, gloablSize).
+// Specifies a subrange [begin, end) of a range [0, globalSize).
 template< typename Index >
 class Subrange
 {
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..d87e078e39db9352a626e2af7cecbf61a48c53b5
--- /dev/null
+++ b/src/TNL/Functional.h
@@ -0,0 +1,162 @@
+/***************************************************************************
+                          Functional.h  -  description
+                             -------------------
+    begin                : July 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <functional>
+#include <algorithm>
+#include <limits>
+
+namespace TNL {
+
+/**
+ * \brief Addition functor derived from \ref std::plus<void>, usable with \ref TNL::Algorithms::reduce.
+ *
+ * \e getIdempotent returns zero converted to \e T, i.e. the neutral element of addition.
+ */
+struct Plus : public std::plus< void >
+{
+   template< typename T > static constexpr T getIdempotent() { return T( 0 ); }
+};
+
+/**
+ * \brief Multiplication functor derived from std::multiplies<void>, usable with \ref TNL::Algorithms::reduce.
+ *
+ * Operand types are deduced when operator() is invoked; \e getIdempotent returns one converted to \e T.
+ */
+struct Multiplies : public std::multiplies< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return T( 1 ); }
+};
+
+/**
+ * \brief Minimum functor for use with \ref TNL::Algorithms::reduce.
+ *
+ * The operand type is deduced when operator() is invoked; \e getIdempotent returns numeric_limits< T >::max().
+ */
+struct Min
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); }
+
+   template< typename Value >
+   constexpr Value operator()( const Value& a, const Value& b ) const { return a < b ? a : b; }
+};
+
+/**
+ * \brief Maximum functor for use with \ref TNL::Algorithms::reduce.
+ *
+ * \e getIdempotent uses lowest() rather than min(): for floating-point \e T, min() is the smallest positive value.
+ */
+struct Max
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::lowest(); }
+
+   template< typename Value >
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs > rhs ? lhs : rhs; }
+};
+
+/**
+ * \brief Minimum-with-position functor for use with \ref TNL::Algorithms::reduceWithArgument.
+ *
+ * operator() leaves in ( \e lhs, \e lhsIdx ) the smaller of the two values together with its index.
+ * When both values are equal, only the index is updated and the smaller one is kept.
+ * The value and index types are deduced when operator() is invoked.
+ */
+struct MinWithArg
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); }
+
+   template< typename Value, typename Index >
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
+   {
+      // rhs replaces lhs when it is strictly smaller; on a tie only the smaller index is taken.
+      const bool smaller = lhs > rhs;
+      const bool takeIndex = smaller || ( lhs == rhs && rhsIdx < lhsIdx );
+      if( takeIndex )
+         lhsIdx = rhsIdx;
+      if( smaller )
+         lhs = rhs;
+   }
+};
+
+/**
+ * \brief Maximum-with-position functor for use with \ref TNL::Algorithms::reduceWithArgument.
+ *
+ * operator() leaves in ( \e lhs, \e lhsIdx ) the larger of the two values together with its index;
+ * when both values are equal, the smaller index is kept.
+ * \e getIdempotent uses lowest() rather than min(): for floating-point \e T, min() is the smallest positive value.
+ */
+struct MaxWithArg
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::lowest(); }
+
+   template< typename Value, typename Index >
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
+   {
+      if( lhs < rhs )
+      {
+         lhs = rhs;
+         lhsIdx = rhsIdx;
+      }
+      else if( lhs == rhs && rhsIdx < lhsIdx )
+         lhsIdx = rhsIdx;
+   }
+};
+
+/**
+ * \brief Logical-and functor derived from std::logical_and<void>, usable with \ref TNL::Algorithms::reduce.
+ *
+ * Operand types are deduced when operator() is invoked; \e getIdempotent returns \e true converted to \e T.
+ */
+struct LogicalAnd : public std::logical_and< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return T( true ); }
+};
+
+/**
+ * \brief Logical-or functor derived from std::logical_or<void>, usable with \ref TNL::Algorithms::reduce.
+ *
+ * Operand types are deduced when operator() is invoked; \e getIdempotent returns \e false converted to \e T.
+ */
+struct LogicalOr : public std::logical_or< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return T( false ); }
+};
+
+/**
+ * \brief Bitwise-and functor derived from std::bit_and<void>, usable with \ref TNL::Algorithms::reduce.
+ *
+ * Operand types are deduced when operator() is invoked; \e getIdempotent returns the complement of zero in \e T.
+ */
+struct BitAnd : public std::bit_and< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return ~T( 0 ); }
+};
+
+/**
+ * \brief Bitwise-or functor derived from std::bit_or<void>, usable with \ref TNL::Algorithms::reduce.
+ *
+ * Operand types are deduced when operator() is invoked; \e getIdempotent returns zero converted to \e T.
+ */
+struct BitOr : public std::bit_or< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return T( 0 ); }
+};
+
+} // namespace TNL
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 0bf262aa041523b298f1f207081847959da758e3..4a999d76b07ce9f4b1659609edafbc9683cca336 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -168,7 +168,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/Matrix.hpp b/src/TNL/Matrices/Matrix.hpp
index d057a3ecf9835458b99a2e07db620be724abfcd9..7e9a3722b016462ba5cc34dde3347c9e2ff800e4 100644
--- a/src/TNL/Matrices/Matrix.hpp
+++ b/src/TNL/Matrices/Matrix.hpp
@@ -85,7 +85,7 @@ Index Matrix< Real, Device, Index, RealAllocator >::getNonzeroElementsCount() co
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/MatrixView.hpp b/src/TNL/Matrices/MatrixView.hpp
index 83563a82570a4f4b98e12b125c4d447f1492b982..287305b5cd7d5ba83f2c68f255507b1fe595445d 100644
--- a/src/TNL/Matrices/MatrixView.hpp
+++ b/src/TNL/Matrices/MatrixView.hpp
@@ -63,7 +63,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.hpp b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
index 7dadde222a41ec5832fa1bffebad8b86ba75427f..2b83fc87bc87b3ce6b1e18e137310b127f5dd4d5 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
@@ -173,7 +173,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 593c100a98f93730d7175f3aa0ed2f287e1bfd94..02ef757c2831f5b95b29ab54900962d51ab14d7c 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -191,7 +191,7 @@ getNonzeroElementsCount() const
       auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
          return ( columns_view[ i ] != paddingIndex );
       };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->columnIndexes.getSize(), fetch, std::plus<>{}, 0 );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->columnIndexes.getSize(), fetch, std::plus<>{}, 0 );
    }
    else
    {
@@ -869,7 +869,7 @@ operator==( const Matrix& m ) const
    {
       return view1.getRow( i ) == view2.getRow( i );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( 0, this->getRows(), fetch, std::logical_and<>{}, true );
+   return Algorithms::reduce< DeviceType >( 0, this->getRows(), fetch, std::logical_and<>{}, true );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp
index 3aa633776c7bbacc11318b9f3871b87027bbcca2..5e7bfe7567f67527d46667a7e8d4420a0dd8b31d 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp
@@ -133,7 +133,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
index 0fc2bebafe3e5df7983c19f58872c993c4bb0892..b722535a516726ef40727ea77053885d311dff62 100644
--- a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
+++ b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
@@ -137,8 +137,8 @@ public:
       {
          return bool(tags_view[ entityIndex ] & EntityTags::GhostEntity);
       };
-      const GlobalIndexType boundaryEntities = Algorithms::Reduction< Device >::reduce( (GlobalIndexType) 0, tags.getSize(), is_boundary, std::plus<>{}, (GlobalIndexType) 0 );
-      const GlobalIndexType ghostEntities = Algorithms::Reduction< Device >::reduce( (GlobalIndexType) 0, tags.getSize(), is_ghost, std::plus<>{}, (GlobalIndexType) 0 );
+      const GlobalIndexType boundaryEntities = Algorithms::reduce< Device >( (GlobalIndexType) 0, tags.getSize(), is_boundary, std::plus<>{}, (GlobalIndexType) 0 );
+      const GlobalIndexType ghostEntities = Algorithms::reduce< Device >( (GlobalIndexType) 0, tags.getSize(), is_ghost, std::plus<>{}, (GlobalIndexType) 0 );
 
       interiorIndices.setSize( tags.getSize() - boundaryEntities );
       boundaryIndices.setSize( boundaryEntities );
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 1e4361f4931c3213cb5dd36c9bf23a5796f84df8..14a7d43ab35a0ea9d5dc26fa8e571cef97dfbe1b 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -4,6 +4,7 @@ set( COMMON_TESTS
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
+         ReductionTest
          staticForTest
          unrolledForTest
 )
diff --git a/src/UnitTests/Algorithms/ReductionTest.cpp b/src/UnitTests/Algorithms/ReductionTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4d630e5f91eda483499442377b6cae3e1a88c6e0
--- /dev/null
+++ b/src/UnitTests/Algorithms/ReductionTest.cpp
@@ -0,0 +1 @@
+#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.cu b/src/UnitTests/Algorithms/ReductionTest.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4d630e5f91eda483499442377b6cae3e1a88c6e0
--- /dev/null
+++ b/src/UnitTests/Algorithms/ReductionTest.cu
@@ -0,0 +1 @@
+#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..b880642b8c0abb5549dd6e8fc818ec8947c99cec
--- /dev/null
+++ b/src/UnitTests/Algorithms/ReductionTest.h
@@ -0,0 +1,246 @@
+/***************************************************************************
+                          ReductionTest.h  -  description
+                             -------------------
+    begin                : Jul 2, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/Reduction.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+using namespace TNL;
+
+#ifdef HAVE_GTEST
+
+// Sum reduction: an array filled with ones must sum up to its own size.
+template< typename Device >
+void ReduceTest_sum()
+{
+   using ArrayType = Containers::Array< int, Device >;
+   ArrayType ones;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      ones.setSize( size );
+      ones.setValue( 1 );
+      auto view = ones.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto sum = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Plus{} );
+      EXPECT_EQ( sum, size );
+   }
+}
+
+// Min reduction: the elements are 1, 2, ..., size, so the minimum is always 1.
+template< typename Device >
+void ReduceTest_min()
+{
+   using ArrayType = Containers::Array< int, Device >;
+   ArrayType data;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      data.setSize( size );
+      data.forAllElements( [] __cuda_callable__ ( int i, int& element ) { element = i + 1; } );
+      auto view = data.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto minimum = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Min{} );
+      EXPECT_EQ( minimum, 1 );
+   }
+}
+
+// Max reduction: the elements are 1, 2, ..., size, so the maximum equals size.
+template< typename Device >
+void ReduceTest_max()
+{
+   using ArrayType = Containers::Array< int, Device >;
+   ArrayType data;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      data.setSize( size );
+      data.forAllElements( [] __cuda_callable__ ( int i, int& element ) { element = i + 1; } );
+      auto view = data.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto maximum = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Max{} );
+      EXPECT_EQ( maximum, size );
+   }
+}
+
+// Min-with-argument reduction: the smallest value 1 is stored at position 0.
+template< typename Device >
+void ReduceTest_minWithArg()
+{
+   using ArrayType = Containers::Array< int, Device >;
+   ArrayType data;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      data.setSize( size );
+      data.forAllElements( [] __cuda_callable__ ( int i, int& element ) { element = i + 1; } );
+      auto view = data.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto result = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MinWithArg{} );
+      EXPECT_EQ( result.first, 1 );
+      EXPECT_EQ( result.second, 0 );
+   }
+}
+
+// Max-with-argument reduction: the largest value `size` is stored at position size - 1.
+template< typename Device >
+void ReduceTest_maxWithArg()
+{
+   using ArrayType = Containers::Array< int, Device >;
+   ArrayType data;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      data.setSize( size );
+      data.forAllElements( [] __cuda_callable__ ( int i, int& element ) { element = i + 1; } );
+      auto view = data.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto result = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MaxWithArg{} );
+      EXPECT_EQ( result.first, size );
+      EXPECT_EQ( result.second, size - 1 );
+   }
+}
+
+// Logical-and reduction: the flags alternate false, true, ..., so the conjunction is false.
+template< typename Device >
+void ReduceTest_logicalAnd()
+{
+   using ArrayType = Containers::Array< bool, Device >;
+   ArrayType flags;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      flags.setSize( size );
+      flags.forAllElements( [] __cuda_callable__ ( int i, bool& flag ) { flag = ( bool ) ( i % 2 ); } );
+      auto view = flags.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto conjunction = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalAnd{} );
+      EXPECT_EQ( conjunction, false );
+   }
+}
+
+// Logical-or reduction: the flags alternate false, true, ..., so the disjunction is true.
+template< typename Device >
+void ReduceTest_logicalOr()
+{
+   using ArrayType = Containers::Array< bool, Device >;
+   ArrayType flags;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      flags.setSize( size );
+      flags.forAllElements( [] __cuda_callable__ ( int i, bool& flag ) { flag = ( bool ) ( i % 2 ); } );
+      auto view = flags.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto disjunction = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalOr{} );
+      EXPECT_EQ( disjunction, true );
+   }
+}
+
+// Bitwise-and reduction: every element has bit 0 set plus bit (i % 8), so only bit 0 survives.
+template< typename Device >
+void ReduceTest_bitAnd()
+{
+   using ArrayType = Containers::Array< char, Device >;
+   ArrayType masks;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      masks.setSize( size );
+      masks.forAllElements( [] __cuda_callable__ ( int i, char& mask ) { mask = 1 | ( 1 << ( i % 8 ) ); } );
+      auto view = masks.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto common = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitAnd{} );
+      EXPECT_EQ( common, 1 );
+   }
+}
+
+// Bitwise-or reduction: element i carries only bit (i % 8), so the union has all eight low bits set.
+template< typename Device >
+void ReduceTest_bitOr()
+{
+   using ArrayType = Containers::Array< char, Device >;
+   ArrayType masks;
+   for( int size = 100; size <= 1000000; size *= 10 ) {
+      masks.setSize( size );
+      masks.forAllElements( [] __cuda_callable__ ( int i, char& mask ) { mask = 1 << ( i % 8 ); } );
+      auto view = masks.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
+      const auto combined = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitOr{} );
+      EXPECT_EQ( combined, ( char ) 255 );
+   }
+}
+
+// test fixture for typed tests; exposes the device under test as DeviceType
+template< typename Device >
+class ReduceTest : public ::testing::Test
+{
+protected:
+   using DeviceType = Device;
+};
+
+// device types for which ReduceTest is instantiated (Cuda only when HAVE_CUDA is defined)
+using DeviceTypes = ::testing::Types<
+   Devices::Host
+#ifdef HAVE_CUDA
+   ,Devices::Cuda
+#endif
+   >;
+
+TYPED_TEST_SUITE( ReduceTest, DeviceTypes );  // each TYPED_TEST below runs once per type in DeviceTypes
+
+TYPED_TEST( ReduceTest, sum )
+{
+   ReduceTest_sum< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, min )
+{
+   ReduceTest_min< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, max )
+{
+   ReduceTest_max< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, minWithArg )
+{
+   ReduceTest_minWithArg< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, maxWithArg )
+{
+   ReduceTest_maxWithArg< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, logicalAnd )
+{
+   ReduceTest_logicalAnd< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, logicalOr )
+{
+   ReduceTest_logicalOr< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, bitAnd )
+{
+   ReduceTest_bitAnd< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, bitOr )
+{
+   ReduceTest_bitOr< typename TestFixture::DeviceType >();
+}
+
+#endif
+
+#include "../main.h"
diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt
index 04a3a4f00259e580d8f5cd0cabf9bcccb559e813..d50b682ba8a2284664c63a0c98767ae3faac44aa 100644
--- a/src/UnitTests/CMakeLists.txt
+++ b/src/UnitTests/CMakeLists.txt
@@ -6,7 +6,14 @@ ADD_SUBDIRECTORY( Functions )
 ADD_SUBDIRECTORY( Meshes )
 ADD_SUBDIRECTORY( Pointers )
 
-set( CPP_TESTS  AssertTest base64Test FileNameTest MathTest ObjectTest StringTest TimerTest TypeInfoTest )
+set( CPP_TESTS  AssertTest
+                base64Test
+                FileNameTest
+                MathTest
+                ObjectTest
+                StringTest
+                TimerTest
+                TypeInfoTest )
 set( CUDA_TESTS  AssertCudaTest )
 if( BUILD_CUDA )
    set( CUDA_TESTS  ${CUDA_TESTS} AllocatorsTest FileTest )
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index c6dfa3842d88c3895156b61e5a71c162d05c8859..ef7d077a536d543f29a17f5c8086469139a572be 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -446,7 +446,7 @@ void test_SetElement()
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool {
       return ( v_view[ i ] == m_view.getElement( i, i ) );
    };
-   EXPECT_TRUE( TNL::Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, m.getRows(), fetch, std::logical_and<>{}, true ) );
+   EXPECT_TRUE( TNL::Algorithms::reduce< DeviceType >( ( IndexType ) 0, m.getRows(), fetch, std::logical_and<>{}, true ) );
 
 }