diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.h b/src/Benchmarks/BLAS/CommonVectorOperations.h index 504da8fa6d06602382df07da05570ad08ff7aef0..ed29153688d0fce31621ee7fa95a6dc9b45174bf 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.h +++ b/src/Benchmarks/BLAS/CommonVectorOperations.h @@ -10,8 +10,6 @@ #pragma once -#include - namespace TNL { namespace Benchmarks { diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index 727a0abcf008aec3e8654ff1a8ac88715e9fc280..640fda337b5d8a8a6dcec75f081702eccb45464c 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -11,7 +11,6 @@ #pragma once #include -#include #include "CommonVectorOperations.h" namespace TNL { @@ -30,9 +29,8 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -48,9 +46,8 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile 
ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -66,9 +63,8 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -84,9 +80,8 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, 
const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -102,9 +97,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -119,10 +112,8 @@ getVectorL2Norm( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -144,10 +135,8 @@ getVectorLpNorm( const Vector& v, return getVectorL2Norm< Vector, ResultType >( v ); const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ 
( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -165,10 +154,8 @@ getVectorSum( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -186,10 +173,9 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< 
ResultType >::lowest() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -207,10 +193,9 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -228,10 +213,9 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, 
std::numeric_limits< ResultType >::lowest() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -249,10 +233,9 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -270,10 +253,8 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, 
volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -295,9 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, auto diff = data1[ i ] - data2[ i ]; return diff * diff; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -322,10 +301,8 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -343,10 +320,8 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = 
[=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -364,10 +339,8 @@ getScalarProduct( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 25fa055bf59afd1dcaebb85b94af7ab58479ec5b..80c63020dcfa27b498dbf4fe6da12cfe1501a9d9 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -346,7 +346,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normCudaET = [&]() { resultDevice = lpNorm( deviceView, 3.0 ); }; - benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", 
l3normHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET ); @@ -369,7 +368,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductCudaET = [&]() { resultDevice = ( deviceView, deviceView2 ); }; - #ifdef HAVE_BLAS auto scalarProductBlas = [&]() { resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 ); @@ -395,38 +393,6 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif - //// - // Prefix sum - /* - std::cout << "Benchmarking prefix-sum:" << std::endl; - timer.reset(); - timer.start(); - hostVector.computePrefixSum(); - timer.stop(); - timeHost = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - - timer.reset(); - timer.start(); - deviceVector.computePrefixSum(); - timer.stop(); - timeDevice = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." 
<< std::endl; - std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; - - HostVector auxHostVector; - auxHostVector.setLike( deviceVector ); - auxHostVector = deviceVector; - for( int i = 0; i < size; i++ ) - if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) - { - std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; - } - */ - - //// // Scalar multiplication auto multiplyHost = [&]() { @@ -435,6 +401,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto multiplyCuda = [&]() { deviceVector *= 0.5; }; +#ifdef HAVE_BLAS + auto multiplyBlas = [&]() { + blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 ); + }; +#endif #ifdef HAVE_CUDA auto multiplyCublas = [&]() { const Real alpha = 0.5; @@ -445,6 +416,9 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas ); +#endif #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); @@ -606,6 +580,34 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas ); #endif + //// + // Inclusive prefix sum + auto inclusivePrefixSumHost = [&]() { + hostVector.prefixSum(); + }; + auto inclusivePrefixSumCuda = [&]() { + deviceVector.prefixSum(); + }; + benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda ); +#endif + + //// + // Exclusive prefix sum + auto exclusivePrefixSumHost = [&]() { + hostVector.template 
prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + auto exclusivePrefixSumCuda = [&]() { + deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda ); +#endif + #ifdef HAVE_CUDA cublasDestroy( cublasHandle ); #endif diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 42b19814e64ad4b5c793752fbe7b5345d8860a0a..683a18376276c4b0dbd194329226e9a517a4af12 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -17,7 +17,6 @@ #include "Logging.h" #include -#include #include #include @@ -35,24 +34,24 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; - struct BenchmarkResult { using HeaderElements = Logging::HeaderElements; using RowElements = Logging::RowElements; - double bandwidth = std::numeric_limits::quiet_NaN(); double time = std::numeric_limits::quiet_NaN(); + double stddev = std::numeric_limits::quiet_NaN(); + double bandwidth = std::numeric_limits::quiet_NaN(); double speedup = std::numeric_limits::quiet_NaN(); virtual HeaderElements getTableHeader() const { - return HeaderElements({"bandwidth", "time", "speedup"}); + return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" }); } virtual RowElements getRowElements() const { - return RowElements({ bandwidth, time, speedup }); + return RowElements({ time, stddev, stddev / time, bandwidth, speedup }); } }; @@ -65,18 +64,17 @@ public: using Logging::MetadataMap; using Logging::MetadataColumns; using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >; - + Benchmark( int loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) {} - + static void configSetup( Config::ConfigDescription& config ) { config.addEntry< 
int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< bool >( "reset", "Call reset function between loops.", true ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); - config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } @@ -85,7 +83,6 @@ public: this->loops = parameters.getParameter< int >( "loops" ); this->reset = parameters.getParameter< bool >( "reset" ); this->minTime = parameters.getParameter< double >( "min-time" ); - this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< int >( "verbose" ); Logging::setVerbose( verbose ); } @@ -96,7 +93,7 @@ public: { this->loops = loops; } - + void setMinTime( const double& minTime ) { this->minTime = minTime; @@ -121,7 +118,6 @@ public: metadata["loops"] = convertToString(loops); metadata["reset"] = convertToString( reset ); metadata["minimal test time"] = convertToString( minTime ); - metadata["timing"] = convertToString( timing ); writeMetadata( metadata ); } @@ -203,33 +199,22 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. 
template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } @@ -248,7 +233,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ResetFunction, typename ComputeFunction, typename... NextComputations > @@ -272,21 +257,16 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. 
template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -304,7 +284,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ComputeFunction, typename... NextComputations > inline double @@ -345,7 +325,6 @@ protected: double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; - bool timing = true; bool reset = true; SolverMonitorType monitor; }; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index a76ebd5589d38aac5d53687f6c7e7f48e732f8ff..1edd6120476f5f50bf3eb714ca3ea1bd8a8ca4aa 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -17,6 +17,7 @@ #include #include +#include #include namespace TNL { @@ -25,110 +26,94 @@ namespace Benchmarks { template< typename Device > class FunctionTimer { - public: - using DeviceType = Device; - - template< bool timing, - typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - ResetFunction reset, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor(), - bool performReset = true ) +public: + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = 
TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + ResetFunction reset, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + if( verbose > 1 ) + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + Containers::Vector< double > results( maxLoops ); + results.setValue( 0.0 ); + + for( loops = 0; + loops < maxLoops || sum( results ) < minTime; + loops++ ) { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - if( verbose > 1 ) - monitor.setTimer( timer ); - - // warm up + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); reset(); - compute(); - // If we do not perform reset function and don't need - // the monitor, the timer is not interrupted after each loop. - if( ! performReset && verbose < 2 ) - { - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - if( timing ) - timer.start(); - - for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) - compute(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - if( timing ) - timer.stop(); - } - else - { - for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) - { - // abuse the monitor's "time" for loops - monitor.setTime( loops + 1 ); - reset(); - - // Explicit synchronization of the CUDA device + // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif 
- if( timing ) - timer.start(); - compute(); + + // reset timer before each computation + timer.reset(); + timer.start(); + compute(); #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - if( timing ) - timer.stop(); - } - } - if( timing ) - return timer.getRealTime() / ( double ) loops; - else - return std::numeric_limits::quiet_NaN(); - } + timer.stop(); - template< bool timing, - typename ComputeFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - auto noReset = [] () {}; - return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + results[ loops ] = timer.getRealTime(); } - int getPerformedLoops() const - { - return this->loops; + const double mean = sum( results ) / (double) loops; + if( loops > 1 ) { + const double stddev = 1.0 / std::sqrt( loops - 1 ) * l2Norm( results - mean ); + return std::make_pair( mean, stddev ); } - - protected: - int loops; + else { + const double stddev = std::numeric_limits::quiet_NaN(); + return std::make_pair( mean, stddev ); + } + } + + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor ); + } + + int getPerformedLoops() const + { + return this->loops; + } + +protected: + int loops; }; } // namespace Benchmarks 
diff --git a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h index 33dff1ded6f5968f812236d2dd65b23a2f1c2fc9..944e7f73fe4d9cdf0eb12496b3dc1ee02ccdd5f7 100644 --- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h +++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h @@ -259,12 +259,21 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters, const Index dofsCount = gridXSize * gridYSize; dim3 cudaUpdateBlocks( dofsCount / 256 + ( dofsCount % 256 != 0 ) ); dim3 cudaUpdateBlockSize( 256 ); - + /**** * Initiation */ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out some warnings... + #ifdef __NVCC__ + Real* u = new Real[ static_cast(dofsCount) ]; + Real* aux = new Real[ static_cast(dofsCount) ]; + #else Real* u = new Real[ dofsCount ]; Real* aux = new Real[ dofsCount ]; + #endif Real* max_du = new Real[ cudaUpdateBlocks.x ]; if( ! u || ! 
aux ) { diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h index b10ab7199337b68fe5c51181b4924fcc8a3f89b6..61608d364e769fe4b6e68a0691957745ee805496 100644 --- a/src/Benchmarks/Logging.h +++ b/src/Benchmarks/Logging.h @@ -16,225 +16,224 @@ #include #include #include +#include #include #include +#include + namespace TNL { - namespace Benchmarks { +namespace Benchmarks { class Logging { - public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using HeaderElements = std::vector< String >; - using RowElements = std::vector< double >; - - Logging( int verbose = true ) - : verbose(verbose) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { +public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector; + + using HeaderElements = std::vector< String >; + using RowElements = std::vector< double >; + + Logging( int verbose = true ) + : verbose(verbose) + {} + + void + setVerbose( int verbose) + { + this->verbose = verbose; + } + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << 
it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; + std::cout << std::setw( 20 ) << it.first; } - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; - log << header_indent << " " << spanningElement << std::endl; for( auto & it : subElements ) { - log << header_indent << "! 
" << it << std::endl; + std::cout << std::setw( 15 ) << it; } + std::cout << std::endl; - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } + header_changed = false; } - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; } - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns + // dump stacked spanning columns + if( horizontalGroups.size() > 0 ) while( horizontalGroups.back().second <= 0 ) { horizontalGroups.pop_back(); header_indent.pop_back(); } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << 
std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; } + } - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; } - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); } + } - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; } - return false; + std::cout << std::endl; } - protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); + // only when changed (the header has been already adjusted) + // print each 
element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; } - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); - } // namespace Benchmarks -} // namespace TNL + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + +protected: + // manual double -> String conversion with 
fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + int verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; +} // namespace Benchmarks +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 8623c99de0aa29946a1f5bd2c6282fd713b0a938..46618fcd7228c1f055a82fd92d36fc50ee852348 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace TNL { namespace Containers { @@ -133,10 +132,8 @@ compare( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; }; + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } template< typename Element, @@ -151,10 +148,8 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, (Index) 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + return Reduction< Devices::Cuda >::reduce( size, std::logical_or<>{}, fetch, false ); } template< typename Element, @@ -169,10 +164,8 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp index f7c164d066bf6b8d7ece3bfe32cf0759de646d6d..3a54a862f0e8d7c479a3c2fe1a42d536550ac49c 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp @@ -17,7 +17,6 @@ #include #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp index 5bf04dbaf5374cf564932f93b64aabdc5dba4a6a..4113bbcd90f0edce53d143cf65996a392c2a91b4 100644 --- 
a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp @@ -18,7 +18,6 @@ #include #include #include -#include #include namespace TNL { diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index e8fc7f8bb913dbc0e280f5d3c6b60d77498cd719..e67c11b419dca3ae83706d43c56f5f282aba4beb 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -12,14 +12,11 @@ #pragma once -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include namespace TNL { namespace Containers { @@ -32,157 +29,128 @@ namespace Algorithms { * architecture so that there are no local memory spills. */ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2 -static constexpr int Multireduction_registersPerThread = 38; // empirically determined optimal value +static constexpr int Multireduction_registersPerThread = 32; // empirically determined optimal value // __CUDA_ARCH__ is defined only in device code! 
#if (__CUDA_ARCH__ >= 300 ) - static constexpr int Multireduction_minBlocksPerMultiprocessor = 6; + static constexpr int Multireduction_minBlocksPerMultiprocessor = 8; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSizeX, typename Operation, typename Index > +template< int blockSizeX, + typename Result, + typename DataFetcher, + typename Reduction, + typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) -CudaMultireductionKernel( Operation operation, - const int n, +CudaMultireductionKernel( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + const int n, + Result* output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - - /*** - * Get thread id (tid) and global element id (gid). - * gridSizeX is the number of elements in the direction of x-axis - * processed by all blocks at the same time. - */ - const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x; - IndexType gid = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType gridSizeX = blockDim.x * gridDim.x; - - /*** - * Shift input1 and output pointers. - */ - const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; - if( y < n ) { - input1 += y * ldInput1; - output += y * gridDim.x; - } - else - return; - - /*** - * Start with the sequential reduction and push the - * result into the shared memory. 
- */ - sdata[ tid ] = operation.initialValue(); - while( gid + 4 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + + // Get the thread id (tid), global thread id (gid) and gridSize. + const Index tid = threadIdx.y * blockDim.x + threadIdx.x; + Index gid = blockIdx.x * blockDim.x + threadIdx.x; + const Index gridSizeX = blockDim.x * gridDim.x; + + // Get the dataset index. + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if( y >= n ) return; + + sdata[ tid ] = zero; + + // Start with the sequential reduction and push the result into the shared memory. + while( gid + 4 * gridSizeX < size ) { + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); gid += 4 * gridSizeX; } - while( gid + 2 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); + while( gid + 2 * gridSizeX < size ) { + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); gid += 2 * gridSizeX; } - while( gid < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); + while( gid < size ) { + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); gid += gridSizeX; } __syncthreads(); - - //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); - - /*** - * Perform the parallel 
reduction. - */ + // Perform the parallel reduction. if( blockSizeX >= 1024 ) { - if( threadIdx.x < 512 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); - } + if( threadIdx.x < 512 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSizeX >= 512 ) { - if( threadIdx.x < 256 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); - } + if( threadIdx.x < 256 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSizeX >= 256 ) { - if( threadIdx.x < 128 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); - } + if( threadIdx.x < 128 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSizeX >= 128 ) { - if( threadIdx.x < 64 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); - } + if( threadIdx.x < 64 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } - /*** - * This runs in one warp so it is synchronized implicitly. - * - * When the blockSizeX is less then or equal to the warp size, the shared memory - * must be at least 2 * blockSizeX elements per block, otherwise unallocated memory - * will be accessed !!! - */ + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( threadIdx.x < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSizeX >= 64 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } - if( blockSizeX >= 32 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSizeX >= 16 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSizeX >= 8 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSizeX >= 4 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSizeX >= 2 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } + if( blockSizeX >= 64 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); + // Note that here we do not have to check if tid < 16 etc, because we have + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSizeX >= 32 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); + if( blockSizeX >= 16 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); + if( blockSizeX >= 8 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); + if( blockSizeX >= 4 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); + if( blockSizeX >= 2 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } - /*** - * Store the result back in the global memory. - */ + // Store the result back in the global memory. 
if( threadIdx.x == 0 ) { - output[ blockIdx.x ] = sdata[ tid ]; + output[ blockIdx.x + y * gridDim.x ] = sdata[ tid ]; } } +#endif -template< typename Operation, typename Index > +template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > int -CudaMultireductionKernelLauncher( Operation& operation, - const int n, +CudaMultireductionKernelLauncher( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType*& output ) + const int n, + Result*& output ) { - typedef typename Operation::ResultType ResultType; - +#ifdef HAVE_CUDA // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -197,7 +165,7 @@ CudaMultireductionKernelLauncher( Operation& operation, / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread ); const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); dim3 blockSize, gridSize; - + // version A: max 16 rows of threads blockSize.y = TNL::min( n, 16 ); @@ -231,87 +199,86 @@ CudaMultireductionKernelLauncher( Operation& operation, // create reference to the reduction buffer singleton and set size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) - const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); + const std::size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result 
>(); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * blockSize.y * sizeof( ResultType ) - : blockSize.x * blockSize.y * sizeof( ResultType ); - - //cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem < - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 256: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 128: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 64: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( 
zero, dataFetcher, reduction, size, n, output ); break; case 32: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 16: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 8: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 4: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 2: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); + 
cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); default: throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; // return the size of the output array on the CUDA device return gridSize.x; -} +#else + throw Exceptions::CudaSupportMissing(); #endif +} } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 40a662b0856f726370537a8f9431e77126d3221f..ae3bb84de5c167bd40359908d60feb99f8947032 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -15,7 +15,6 @@ #include #include #include -#include #include namespace TNL { @@ -25,24 +24,21 @@ namespace Algorithms { #ifdef HAVE_CUDA template< typename Real, - typename Operation, - typename VolatileOperation, + typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, - Operation operation, - VolatileOperation volatileOperation, + Reduction reduction, const Real zero, const Index size, - const Index elementsInBlock, + const int elementsInBlock, const Real* input, Real* output, - Real* auxArray, - const Real gridShift ) + Real* auxArray ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); - volatile Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; - volatile Real* warpSums = &auxData[ blockDim.x ]; + Real* auxData 
= &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; + Real* warpSums = &auxData[ blockDim.x ]; const Index lastElementIdx = size - blockIdx.x * elementsInBlock; const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock ); @@ -50,8 +46,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, /*** * Load data into the shared memory. */ - const Index blockOffset = blockIdx.x * elementsInBlock; - Index idx = threadIdx.x; + const int blockOffset = blockIdx.x * elementsInBlock; + int idx = threadIdx.x; if( prefixSumType == PrefixSumType::Exclusive ) { if( idx == 0 ) @@ -70,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx += blockDim.x; } } - if( blockIdx.x == 0 && threadIdx.x == 0 ) - operation( sharedData[ 0 ], gridShift ); /*** * Perform the sequential prefix-sum. @@ -87,14 +81,15 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; } - Index chunkPointer( 1 ); + int chunkPointer = 1; while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { - operation( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx.x ] = - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; chunkPointer++; } @@ -103,9 +98,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, */ const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize(); const int warpIdx = threadIdx.x / 
Devices::Cuda::getWarpSize(); - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks ) - volatileOperation( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + __syncwarp(); + } if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx.x ]; @@ -115,29 +112,32 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) - volatileOperation( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + __syncwarp(); + } __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) - volatileOperation( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + __syncthreads(); /*** * Store the result back in global memory. 
*/ - __syncthreads(); idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { - const Index chunkIdx = idx / chunkSize; + const int chunkIdx = idx / chunkSize; Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; - operation( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); + sharedData[ Devices::Cuda::getInterleaving( idx ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ]; idx += blockDim.x; } @@ -147,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, { if( prefixSumType == PrefixSumType::Exclusive ) { - Real aux = zero; - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); - auxArray[ blockIdx.x ] = aux; + auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], + sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); } else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; @@ -158,26 +156,26 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } template< typename Real, - typename Operation, + typename Reduction, typename Index > __global__ void -cudaSecondPhaseBlockPrefixSum( Operation operation, +cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, - const Index elementsInBlock, - Real gridShift, + const int elementsInBlock, + const Index gridIdx, + const Index maxGridSize, const Real* auxArray, - Real* data ) + Real* data, + Real shift ) { - if( blockIdx.x > 0 ) + if( gridIdx > 0 || blockIdx.x > 0 ) + shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); + const int readOffset = blockIdx.x * elementsInBlock; + int readIdx = threadIdx.x; + while( readIdx < 
elementsInBlock && readOffset + readIdx < size ) { - const Real shift = auxArray[ blockIdx.x - 1 ]; - const Index readOffset = blockIdx.x * elementsInBlock; - Index readIdx = threadIdx.x; - while( readIdx < elementsInBlock && readOffset + readIdx < size ) - { - operation( data[ readIdx + readOffset ], shift ); - readIdx += blockDim.x; - } + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); + readIdx += blockDim.x; } } @@ -186,191 +184,209 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { - template< typename Operation, - typename VolatileOperation > + /**** + * \brief Performs both phases of prefix sum. + * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral element for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. + */ + template< typename Reduction > static void - cudaRecursivePrefixSum( PrefixSumType prefixSumType_, - Operation& operation, - VolatileOperation& volatileOperation, - const Real& zero, - const Index size, - const Index blockSize, - const Index elementsInBlock, - Real& gridShift, - const Real* input, - Real* output ) + perform( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) { + const auto blockShifts = performFirstPhase( + size, + deviceInput, + deviceOutput, + reduction, + zero, + blockSize ); + performSecondPhase( + size, + deviceOutput, + blockShifts.getData(), + reduction, + zero, + blockSize ); + } + + /**** + * \brief Performs the first phase of prefix sum. 
+ * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral value for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. + */ + template< typename Reduction > + static auto + performFirstPhase( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) + { + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - const Index auxArraySize = numberOfBlocks; - - Array< Real, Devices::Cuda > auxArray1, auxArray2; - auxArray1.setSize( auxArraySize ); - auxArray2.setSize( auxArraySize ); - - /**** - * Setup block and grid size. - */ - dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); - cudaBlockSize.x = blockSize; - cudaGridSize.x = roundUpDivision( size, elementsInBlock ); - - /**** - * Run the kernel. 
- */ - const std::size_t sharedDataSize = elementsInBlock + - elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; - const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); - cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( prefixSumType_, - operation, - volatileOperation, - zero, - size, - elementsInBlock, - input, - output, - auxArray1.getData(), - gridShift ); - TNL_CHECK_CUDA_DEVICE; + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); + //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; + // allocate array for the block sums + Array< Real, Devices::Cuda > blockSums; + blockSums.setSize( numberOfBlocks ); - //std::cerr << " auxArray1 = " << auxArray1 << std::endl; - /*** - * In auxArray1 there is now a sum of numbers in each block. - * We must compute prefix-sum of auxArray1 and then shift - * each block. - */ - Real gridShift2 = zero; - if( numberOfBlocks > 1 ) - cudaRecursivePrefixSum( PrefixSumType::Inclusive, - operation, - volatileOperation, - zero, - numberOfBlocks, - blockSize, - elementsInBlock, - gridShift2, - auxArray1.getData(), - auxArray2.getData() ); - - //std::cerr << " auxArray2 = " << auxArray2 << std::endl; - cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> - ( operation, - size, - elementsInBlock, - gridShift, - auxArray2.getData(), - output ); - TNL_CHECK_CUDA_DEVICE; + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; + Index currentSize = size - gridOffset; + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; + //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; + + // setup block and grid size + dim3 
cudaBlockSize, cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + const std::size_t sharedDataSize = elementsInBlock + + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; + const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); + cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( prefixSumType, + reduction, + zero, + currentSize, + elementsInBlock, + &deviceInput[ gridOffset ], + &deviceOutput[ gridOffset ], + &blockSums[ gridIdx * maxGridSize() ] ); + } - cudaMemcpy( &gridShift, - &auxArray2[ auxArraySize - 1 ], - sizeof( Real ), - cudaMemcpyDeviceToHost ); - //std::cerr << "gridShift = " << gridShift << std::endl; + // synchronize the null-stream after all grids + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; + + // blockSums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + if( numberOfBlocks > 1 ) { + CudaPrefixSumKernelLauncher< PrefixSumType::Inclusive, Real, Index >::perform( + blockSums.getSize(), + blockSums.getData(), + blockSums.getData(), + reduction, + zero, + blockSize ); + } + + // Store the number of CUDA grids for the purpose of unit testing, i.e. + // to check if we test the algorithm with more than one CUDA grid. + gridsCount() = numberOfGrids; + + // blockSums now contains shift values for each block - to be used in the second phase + return blockSums; } /**** - * \brief Starts prefix sum in CUDA. + * \brief Performs the second phase of prefix sum. 
* - * \tparam Operation operation to be performed on particular elements - addition usually - * \tparam VolatileOperation - volatile version of Operation - * \param size is number of elements to be scanned - * \param blockSize is CUDA block size - * \param deviceInput is pointer to input data on GPU - * \param deviceOutput is pointer to resulting array, can be the same as input - * \param operation is instance of Operation - * \param volatileOperation is instance of VolatileOperation - * \param zero is neutral element for given Operation + * \param size Number of elements to be scanned. + * \param deviceOutput Pointer to output array on GPU. + * \param blockShifts Pointer to a GPU array containing the block shifts. It is the + * result of the first phase. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param shift A constant shifting all elements of the array (usually `zero`, i.e. + * the neutral value). + * \param blockSize The CUDA block size to be used for kernel launch. 
*/ - template< typename Operation, - typename VolatileOperation > + template< typename Reduction > static void - start( const Index size, - const Index blockSize, - const Real *deviceInput, - Real* deviceOutput, - Operation& operation, - VolatileOperation& volatileOperation, - const Real& zero ) + performSecondPhase( const Index size, + Real* deviceOutput, + const Real* blockShifts, + Reduction& reduction, + const Real shift, + const Index blockSize = 256 ) { - /**** - * Compute the number of grids - */ - const Index elementsInBlock = 8 * blockSize; + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - //const auto maxGridSize = 3; //Devices::Cuda::getMaxGridSize(); - const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); - Real gridShift = zero; - //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); - /**** - * Loop over all grids. 
- */ - for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) - { - /**** - * Compute current grid size and size of data to be scanned - */ - const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; - if( currentSize / elementsInBlock > maxGridSize ) - currentSize = maxGridSize * elementsInBlock; - + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; - cudaRecursivePrefixSum( prefixSumType, - operation, - volatileOperation, - zero, - currentSize, - blockSize, - elementsInBlock, - gridShift, - &deviceInput[ gridOffset ], - &deviceOutput[ gridOffset ] ); - TNL_CHECK_CUDA_DEVICE; + + // setup block and grid size + dim3 cudaBlockSize, cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> + ( reduction, + size, + elementsInBlock, + gridIdx, + (Index) maxGridSize(), + blockShifts, + &deviceOutput[ gridOffset ], + shift ); } - /*** - * Store the number of CUDA grids for a purpose of unit testing, i.e. - * to check if we test the algorithm with more than one CUDA grid. - */ - gridsCount = numberOfGrids; + // synchronize the null-stream after all grids + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. 
*/ - static void setMaxGridSize( int newMaxGridSize ) { - maxGridSize = newMaxGridSize; + static int& maxGridSize() + { + static int maxGridSize = Devices::Cuda::getMaxGridSize(); + return maxGridSize; } - static void resetMaxGridSize() { - maxGridSize = Devices::Cuda::getMaxGridSize(); + static void resetMaxGridSize() + { + maxGridSize() = Devices::Cuda::getMaxGridSize(); } - static int maxGridSize; - - static int gridsCount; + static int& gridsCount() + { + static int gridsCount = -1; + return gridsCount; + } }; -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::maxGridSize = Devices::Cuda::getMaxGridSize(); - -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; - - #endif } // namespace Algorithms } // namespace Containers } // namespace TNL - - diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 89e13a272dcf2253d8430dc73778257e85737174..ce278c7f1b9d8393afa86d848a9dddb2f52fd744 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -12,20 +12,17 @@ #include // std::pair -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include +#include namespace TNL { namespace Containers { namespace Algorithms { -#ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given @@ -34,6 +31,7 @@ namespace Algorithms { static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 static constexpr int Reduction_registersPerThread = 32; // empirically determined optimal value +#ifdef HAVE_CUDA // __CUDA_ARCH__ is defined only in device code! 
#if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 8; @@ -42,47 +40,42 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. 
while( gid + 4 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -90,49 +83,48 @@ CudaReductionKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSize >= 64 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } + if( blockSize >= 64 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); + if( blockSize >= 16 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); + if( blockSize >= 8 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); + if( blockSize >= 4 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); + if( blockSize >= 2 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -141,34 +133,29 @@ CudaReductionKernel( const Result zero, } template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionWithArgumentKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output, Index* idxOutput, const Index* idxInput = nullptr ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] ); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. if( idxInput ) { if( gid < size ) { sdata[ tid ] = dataFetcher( gid ); @@ -243,31 +230,29 @@ CudaReductionWithArgumentKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - volatile IndexType* vsidx = sidx; - if( blockSize >= 64 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] ); - } + if( blockSize >= 64 ) + reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] ); - } + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) + reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); + if( blockSize >= 16 ) + reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); + if( blockSize >= 8 ) + reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); + if( blockSize >= 4 ) + reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); + if( blockSize >= 2 ) + reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -276,15 +261,13 @@ CudaReductionWithArgumentKernel( const Result zero, idxOutput[ blockIdx.x ] = sidx[ 0 ]; } } +#endif template< typename Index, typename Result > struct CudaReductionKernelLauncher { - using IndexType = Index; - using ResultType = Result; - // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -309,62 +292,56 @@ struct CudaReductionKernelLauncher } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int start( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output ) + Result*& output ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType ); + const std::size_t buf_size = 2 * desGridSize * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result >(); - this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); + this->reducedSize = this->launch( originalSize, reduction, dataFetcher, zero, output ); return this->reducedSize; } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int startWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output, - IndexType*& idxOutput ) + Result*& output, + Index*& idxOutput ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * ( sizeof( 
ResultType ) + sizeof( IndexType ) ); + const std::size_t buf_size = 2 * desGridSize * ( sizeof( Result ) + sizeof( Index ) ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); - idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] ); + output = cudaReductionBuffer.template getData< Result >(); + idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] ); - this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr ); + this->reducedSize = this->launchWithArgument( originalSize, reduction, dataFetcher, zero, output, idxOutput, nullptr ); return this->reducedSize; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > Result finish( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; - this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); + auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; + this->reducedSize = this->launch( this->reducedSize, reduction, copyFetch, zero, output ); std::swap( input, output ); } @@ -373,30 +350,28 @@ struct CudaReductionKernelLauncher std::swap( input, 
output ); // Copy result on CPU - ResultType result; + Result result; ArrayOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 ); return result; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > std::pair< Index, Result > finishWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; - IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] ); - IndexType* idxOutput = &idxInput[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; + Index* idxInput = reinterpret_cast< Index* >( &output[ desGridSize ] ); + Index* idxOutput = &idxInput[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; - this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput ); + auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; + this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, copyFetch, zero, output, idxOutput, idxInput ); std::swap( input, output ); std::swap( idxInput, idxOutput ); } @@ -417,217 +392,220 @@ struct CudaReductionKernelLauncher protected: template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launch( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, 
Result* output ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * sizeof( ResultType ) - : blockSize.x * sizeof( ResultType ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * sizeof( Result ) + : blockSize.x * sizeof( Result ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, 
DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. + switch( blockSize.x ) + { + case 512: + CudaReductionKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, 
size, output); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." 
); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); - CudaReductionKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); - } + CudaReductionKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); + } - // Return the size of the output array on the CUDA device - return gridSize.x; + // Return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launchWithArgument( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result* output, Index* idxOutput, const Index* idxInput ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ) - : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) ) + : blockSize.x * ( sizeof( Result ) + sizeof( Index ) ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionWithArgumentKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, 
Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. 
+ switch( blockSize.x ) + { + case 512: + CudaReductionWithArgumentKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 4: + 
cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); - CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); - } + CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); + } - // return the size of the output array on the CUDA device - return gridSize.x; + // return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } const int activeDevice; const int blocksdPerMultiprocessor; const int desGridSize; - const IndexType originalSize; - IndexType reducedSize; + const Index originalSize; + Index reducedSize; }; -#endif } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/DistributedPrefixSum.h b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h new file mode 100644 index 0000000000000000000000000000000000000000..b81e2ac94d7959a417582de613e4a8793040c950 --- /dev/null +++ b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h @@ -0,0 +1,70 @@ +/*************************************************************************** + PrefixSum.h - description + ------------------- + begin : Aug 16, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< PrefixSumType Type > +struct DistributedPrefixSum +{ + template< typename DistributedVector, + typename Reduction > + static void + perform( DistributedVector& v, + typename DistributedVector::IndexType begin, + typename DistributedVector::IndexType end, + const Reduction& reduction, + const typename DistributedVector::RealType zero ) + { + using RealType = typename DistributedVector::RealType; + using DeviceType = typename DistributedVector::DeviceType; + using CommunicatorType = typename DistributedVector::CommunicatorType; + + const auto group = v.getCommunicationGroup(); + if( group != CommunicatorType::NullGroup ) { + // adjust begin and end for the local range + const auto localRange = v.getLocalRange(); + begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin(); + end = max( min( end, localRange.getEnd() ), localRange.getBegin() ) - localRange.getBegin(); + + // perform first phase on the local data + auto localView = v.getLocalView(); + const auto blockShifts = PrefixSum< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); + const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); + + // exchange local sums between ranks + const int nproc = CommunicatorType::GetSize( group ); + RealType dataForScatter[ nproc ]; + for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; + Vector< RealType, Devices::Host > rankSums( nproc ); + // NOTE: exchanging general data types does not work with MPI + CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); + + // compute prefix-sum of the per-rank sums + PrefixSum< Devices::Host, 
PrefixSumType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero ); + + // perform second phase: shift by the per-block and per-rank offsets + const int rank = CommunicatorType::GetRank( group ); + PrefixSum< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] ); + } + } +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 42b8bf28d16842f9d28a624f8995f2ffe959d943..9802a295356db6e53c7f9c3d809c3dc9c70b38c8 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -12,66 +12,78 @@ #pragma once +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. + #include #include -#include namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > -class Multireduction -{ -}; +struct Multireduction; template<> -class Multireduction< Devices::Cuda > +struct Multireduction< Devices::Host > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. 
+ * size: the size of each dataset + * n: number of datasets to be reduced + * result: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); -}; - -template<> -class Multireduction< Devices::Host > -{ -public: - template< typename Operation, typename Index > - static void - reduce( Operation& operation, const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + Result* result ); }; template<> -class Multireduction< Devices::MIC > +struct Multireduction< Devices::Cuda > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. 
+ * size: the size of each dataset + * n: number of datasets to be reduced + * hostResult: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + const int n, + Result* hostResult ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL -#include "Multireduction_impl.h" +#include "Multireduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f4407e6207ba4d67c05ba7f434e2629cfd3c592f --- /dev/null +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -0,0 +1,229 @@ +/*************************************************************************** + Multireduction_impl.h - description + ------------------- + begin : May 13, 2016 + copyright : (C) 2016 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include // std::unique_ptr + +//#define CUDA_REDUCTION_PROFILING + +#include +#include +#include +#include + +#ifdef CUDA_REDUCTION_PROFILING +#include +#include +#endif + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > +void +Multireduction< Devices::Host >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const Index size, + const int n, + Result* result ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); + + constexpr int block_size = 128; + const int blocks = size / block_size; + +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) + { + // first thread initializes the result array + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + } + + // initialize array for thread-local results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + #pragma omp for nowait + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // the first thread 
that reaches here processes the last, incomplete block + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + } + + // local reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); + } + + // inter-thread reduction of local results + #pragma omp critical + { + for( int k = 0; k < n; k++ ) + result[ k ] = reduction( result[ k ], r[ 4 * k ] ); + } + } + } + else { +#endif + if( blocks > 1 ) { + // initialize array for unrolled results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + // main reduction (explicitly unrolled loop) + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // reduction of the last, incomplete block (not unrolled) + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + + // reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); + + // copy the result into the output parameter + result[ k ] = _r[ 0 ]; + } + } + else { + for( int k = 0; k < n; k++ ) + result[ k ] = 
zero; + + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + for( int i = 0; i < block_size; i++ ) + result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) ); + } + } + + for( int k = 0; k < n; k++ ) { + for( Index i = blocks * block_size; i < size; i++ ) + result[ k ] = reduction( result[ k ], dataFetcher( i, k ) ); + } + } +#ifdef HAVE_OPENMP + } +#endif +} + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > +void +Multireduction< Devices::Cuda >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const Index size, + const int n, + Result* hostResult ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); + + #ifdef CUDA_REDUCTION_PROFILING + Timer timer; + timer.reset(); + timer.start(); + #endif + + // start the reduction on the GPU + Result* deviceAux1 = nullptr; + const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; + timer.reset(); + timer.start(); + #endif + + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] }; + ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. 
" << std::endl; + timer.reset(); + timer.start(); + #endif + + // finish the reduction on the host + auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; + Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; + #endif +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h deleted file mode 100644 index ebd0ad2562a4795fcea6abeb9287b5063b92c408..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/Multireduction_impl.h +++ /dev/null @@ -1,327 +0,0 @@ -/*************************************************************************** - Multireduction_impl.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "Multireduction.h" - -//#define CUDA_REDUCTION_PROFILING - -#include -#include -#include -#include -#include -#include - -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/**** - * Arrays smaller than the following constant are reduced on CPU. 
- */ -//static constexpr int Multireduction_minGpuDataSize = 16384;//65536; //16384;//1024;//256; -// TODO: benchmarks with different values -static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * deviceInput1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the deviceInput1 array - * deviceInput2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Cuda >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ) -{ -#ifdef HAVE_CUDA - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." ); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - typedef typename Operation::LaterReductionOperation LaterReductionOperation; - - /*** - * First check if the input array(s) is/are large enough for the multireduction on GPU. - * Otherwise copy it/them to host and multireduce on CPU. 
- */ - if( n * ldInput1 < Multireduction_minGpuDataSize ) { - DataType1 hostArray1[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray1, deviceInput1, n * ldInput1 ); - if( deviceInput2 ) { - using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; - _DT2 hostArray2[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray2, (_DT2*) deviceInput2, size ); - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); - } - else { - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); - } - return; - } - - #ifdef CUDA_REDUCTION_PROFILING - Timer timer; - timer.reset(); - timer.start(); - #endif - - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1 = nullptr; - const IndexType reducedSize = CudaMultireductionKernelLauncher( operation, - n, - size, - deviceInput1, - ldInput1, - deviceInput2, - deviceAux1 ); - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; - timer.reset(); - timer.start(); - #endif - - /*** - * Transfer the reduced data from device to host. - */ - ResultType resultArray[ n * reducedSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray, deviceAux1, n * reducedSize ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. 
" << std::endl; - timer.reset(); - timer.start(); - #endif - -// std::cout << "resultArray = ["; -// for( int i = 0; i < n * reducedSize; i++ ) { -// std::cout << resultArray[ i ]; -// if( i < n * reducedSize - 1 ) -// std::cout << ", "; -// } -// std::cout << "]" << std::endl; - - /*** - * Reduce the data on the host system. - */ - LaterReductionOperation laterReductionOperation; - Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (void*) nullptr, hostResult ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; - #endif - - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * input1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the input1 array - * input2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Host >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." 
); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - - constexpr int block_size = 128; - const int blocks = size / block_size; - -#ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) -#pragma omp parallel - { - // first thread initializes the result array - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - } - - // initialize array for thread-local results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - #pragma omp for nowait - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // the first thread that reaches here processes the last, incomplete block - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - } - - // local reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - } - - // inter-thread reduction of local results - #pragma omp critical - { - for( int k 
= 0; k < n; k++ ) - operation.commonReduction( result[ k ], r[ 4 * k ] ); - } - } - else { -#endif - if( blocks > 1 ) { - // initialize array for unrolled results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - // main reduction (explicitly unrolled loop) - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // reduction of the last, incomplete block (not unrolled) - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - - // reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - - // copy the result into the output parameter - result[ k ] = _r[ 0 ]; - } - } - else { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - for( int i = 0; i < block_size; i++ ) - operation.firstReduction( result[ k ], offset + i, _input1, input2 ); - } - } - - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; 
- for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( result[ k ], i, _input1, input2 ); - } - } -#ifdef HAVE_OPENMP - } -#endif -} - -template< typename Operation, typename Index > -void -Multireduction< Devices::MIC >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - throw Exceptions::NotImplementedError("Multireduction is not implemented for MIC."); -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h index 715d6f1b982c53be5d18b9c3216af955aa8430b8..2b0e40458089ebfe76f1573f07891983103c848d 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -14,92 +14,121 @@ #include #include -#include -#include namespace TNL { namespace Containers { namespace Algorithms { +enum class PrefixSumType { + Exclusive, + Inclusive +}; + template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class PrefixSum {}; +struct PrefixSum; template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class SegmentedPrefixSum {}; +struct SegmentedPrefixSum; template< PrefixSumType Type > -class PrefixSum< Devices::Host, Type > +struct PrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename 
Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class PrefixSum< Devices::Cuda, Type > +struct PrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class 
SegmentedPrefixSum< Devices::Host, Type > +struct SegmentedPrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; template< PrefixSumType Type > -class SegmentedPrefixSum< Devices::Cuda, Type > +struct SegmentedPrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; - - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index d3c3b3071928e89a284bf1db4acea1e8c30ccf3e..8af19d09a5ae3a741c8cd9a612c1dad60bdb694e 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -14,103 +14,228 @@ #include "PrefixSum.h" -//#define 
CUDA_REDUCTION_PROFILING - #include -#include -#include -#include +#include #include +#include #include -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - namespace TNL { namespace Containers { namespace Algorithms { -/**** - * Arrays smaller than the following constant - * are reduced on CPU. The constant must not be larger - * than maximal CUDA grid size. - */ -static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -//// -// PrefixSum on host template< PrefixSumType Type > -template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) +{ +#ifdef HAVE_OPENMP + const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero ); + performSecondPhase( v, blockShifts, begin, end, reduction, zero ); +#else + // sequential prefix-sum does not need a second phase + performFirstPhase( v, begin, end, reduction, zero ); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Host, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - // TODO: parallelize with OpenMP - if( Type == PrefixSumType::Inclusive ) +#ifdef HAVE_OPENMP + const int threads = Devices::Host::getMaxThreadsCount(); + Array< RealType, Devices::Host > block_sums( threads + 1 ); + block_sums[ 0 ] = zero; + + #pragma omp parallel num_threads(threads) 
+ { + // init + const int thread_idx = omp_get_thread_num(); + RealType block_sum = zero; + + // perform prefix-sum on blocks statically assigned to threads + if( Type == PrefixSumType::Inclusive ) { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + block_sum = reduction( block_sum, v[ i ] ); + v[ i ] = block_sum; + } + } + else { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; + v[ i ] = block_sum; + block_sum = reduction( block_sum, x ); + } + } + + // write the block sums into the buffer + block_sums[ thread_idx + 1 ] = block_sum; + } + + // block_sums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + for( int i = 1; i < threads + 1; i++ ) + block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] ); + + // block_sums now contains shift values for each block - to be used in the second phase + return block_sums; +#else + if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); + } else // Exclusive prefix sum { - RealType aux( v[ begin ] ); - v[ begin ] = zero; - for( IndexType i = begin + 1; i < end; i++ ) - { - RealType x = v[ i ]; + RealType aux = zero; + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } + + return 0; +#endif } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > -template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Host, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename 
Vector::RealType shift ) +{ + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + +#ifdef HAVE_OPENMP + const int threads = blockShifts.getSize() - 1; + + // launch exactly the same number of threads as in the first phase + #pragma omp parallel num_threads(threads) + { + const int thread_idx = omp_get_thread_num(); + const RealType offset = reduction( blockShifts[ thread_idx ], shift ); + + // shift intermediate results by the offset + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], offset ); + } +#else + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], shift ); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::perform( + end - begin, + &v[ begin ], // input + &v[ begin ], // output + reduction, + zero ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Cuda, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) +{ #ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], + using RealType = 
typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + return CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performFirstPhase( + end - begin, + &v[ begin ], // input + &v[ begin ], // output reduction, - volatileReduction, zero ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Cuda, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ) +{ +#ifdef HAVE_CUDA + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performSecondPhase( + end - begin, + &v[ begin ], // output + blockShifts.getData(), + reduction, + shift ); +#else + throw Exceptions::CudaSupportMissing(); #endif } -//// -// PrefixSum on host template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Host, Type >:: @@ -118,9 +243,8 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; @@ -130,7 +254,7 @@ perform( Vector& v, { for( IndexType i = begin + 1; i < end; i++ ) if( ! 
flags[ i ] ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { @@ -142,17 +266,14 @@ perform( Vector& v, if( flags[ i ] ) aux = zero; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Cuda, Type >:: @@ -160,23 +281,16 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; -#ifdef HAVE_CUDA - throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); // NOT IMPLEMENTED YET - /*CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], - reduction, - volatileReduction, - zero );*/ + + throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." 
); +#else + throw Exceptions::CudaSupportMissing(); #endif } diff --git a/src/TNL/Containers/Algorithms/PrefixSumType.h b/src/TNL/Containers/Algorithms/PrefixSumType.h deleted file mode 100644 index ba03adfe6bafc66be96c96172aa97572727ee80f..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/PrefixSumType.h +++ /dev/null @@ -1,24 +0,0 @@ -/*************************************************************************** - PrefixSumType.h - description - ------------------- - begin : Jun 6, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -namespace TNL { -namespace Containers { -namespace Algorithms { - -enum class PrefixSumType { - Exclusive, - Inclusive -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index e2c6d5295606bef5c1e1bf71f988b3a220fe39b2..41b000221bb5e7af34828694fbc752607f4ca0c4 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -13,103 +13,64 @@ #pragma once #include // std::pair +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. 
#include #include -#include namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > -class Reduction; +struct Reduction; template<> -class Reduction< Devices::Host > +struct Reduction< Devices::Host > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> -class Reduction< Devices::Cuda > +struct Reduction< Devices::Cuda > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + const 
ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); -}; - -template<> -class Reduction< Devices::MIC > -{ - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); - - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 9d54fd7b1e77c404f4bbfc89dafd3d0c99d9ff24..229af13797f82a4f27f67bc81a5bfb6886a65604 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -12,13 +12,11 @@ #pragma once +#include // std::unique_ptr //#define CUDA_REDUCTION_PROFILING -#include -#include #include -#include #include #include @@ -43,58 +41,54 @@ static constexpr int Reduction_minGpuDataSize = 
256;//65536; //16384;//1024;//25 template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable - ResultType result = zero; -#pragma omp parallel + Result result = zero; + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); + for( Index i = blocks * block_size; i < size; i++ ) + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); } // 
local reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - reduction( result, r[ 0 ] ); + result = reduction( result, r[ 0 ] ); } } return result; @@ -103,34 +97,33 @@ reduce( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); - //operation.dataFetcher( r[ 0 ], i, input1, input2 ); + for( Index i = blocks * block_size; i < size; i++ ) + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { - ResultType result = zero; - for( IndexType i = 0; i < size; i++ ) - reduction( result, dataFetcher( i ) ); + Result result = 
zero; + for( Index i = 0; i < size; i++ ) + result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP @@ -141,36 +134,32 @@ reduce( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Host >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable std::pair< Index, Result > result( -1, zero ); -#pragma omp parallel + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! 
initialized ) { arg[ 0 ] = offset + i; @@ -194,7 +183,7 @@ reduceWithArgument( const Index size, // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); } @@ -217,13 +206,13 @@ reduceWithArgument( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! initialized ) { @@ -246,7 +235,7 @@ reduceWithArgument( const Index size, } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results @@ -257,7 +246,7 @@ reduceWithArgument( const Index size, } else { std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); - for( IndexType i = 1; i < size; i++ ) + for( Index i = 1; i < size; i++ ) reduction( result.first, i, result.second, dataFetcher( i ) ); return result; } @@ -269,28 +258,18 @@ reduceWithArgument( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using 
ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. - */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. + constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -298,18 +277,16 @@ reduce( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + CudaReductionKernelLauncher< Index, Result > reductionLauncher( size ); - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( 0 ); - IndexType reducedSize = reductionLauncher.start( + // start the reduction on the GPU + Result* deviceAux1( 0 ); + const int reducedSize = reductionLauncher.start( reduction, - volatileReduction, dataFetcher, zero, deviceAux1 ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -318,10 +295,21 @@ reduce( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. 
- */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new Result[ static_cast<size_t>(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -331,11 +319,9 @@ reduce( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); + // finish the reduction on the host + auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; + const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -344,10 +330,8 @@ reduce( const Index size, return result; } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
- */ - auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); + // data can't be safely reduced on host, so continue with the reduction on the GPU + auto result = reductionLauncher.finish( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -358,36 +342,23 @@ reduce( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; +} template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. - */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. 
+ constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -395,20 +366,18 @@ reduceWithArgument( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); - - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( nullptr ); - IndexType* deviceIndexes( nullptr ); - IndexType reducedSize = reductionLauncher.startWithArgument( + // start the reduction on the GPU + Result* deviceAux1( nullptr ); + Index* deviceIndexes( nullptr ); + const int reducedSize = reductionLauncher.startWithArgument( reduction, - volatileReduction, dataFetcher, zero, deviceAux1, deviceIndexes ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -417,11 +386,35 @@ reduceWithArgument( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. - */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - std::unique_ptr< IndexType[] > indexArray{ new IndexType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. 
+ #ifdef __NVCC__ + new Result[ static_cast<size_t>(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; + std::unique_ptr< Index[] > indexArray{ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new Index[ static_cast<size_t>(reducedSize) ] + #else + new Index[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); ArrayOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); @@ -432,12 +425,10 @@ reduceWithArgument( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - //auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - //const ResultType result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); - for( IndexType i = 1; i < reducedSize; i++ ) + // finish the reduction on the host +// auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; +// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); + for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); #ifdef CUDA_REDUCTION_PROFILING @@ -447,10 +438,8 @@ reduceWithArgument( const Index size, return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] ); } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
- */ - auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); + // data can't be safely reduced on host, so continue with the reduction on the GPU + auto result = reductionLauncher.finishWithArgument( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -461,9 +450,6 @@ reduceWithArgument( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif } } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/ReductionOperations.h b/src/TNL/Containers/Algorithms/ReductionOperations.h deleted file mode 100644 index 33ef84b1c7f2fb0f9cb5ea4b72fdb9776fcce08c..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/ReductionOperations.h +++ /dev/null @@ -1,641 +0,0 @@ -/*************************************************************************** - ReductionOperations.h - description - ------------------- - begin : Mar 22, 2013 - copyright : (C) 2013 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include // std::numeric_limits - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/* - * Unary operations: reduction on one input vector. 
- */ - -template< typename Data, typename Result = Data > -class ParallelReductionSum -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const ResultType& data ) - { - result += data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const ResultType& data ) - { - result += data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMin -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] ); - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::min( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::min( result, data ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMax -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - 
__cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] ); - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::max( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::max( result, data ); - } -}; - -template< typename Data, typename Result = bool > -class ParallelReductionLogicalAnd -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result && data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = result && data; - } -}; - - -template< typename Data, typename Result = bool > -class ParallelReductionLogicalOr -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result || data; - } - - __cuda_callable__ void - commonReduction( volatile 
ResultType& result, - volatile const Result& data ) - { - result = result || data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsSum : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsMin : public ParallelReductionMin< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsMax : public ParallelReductionMax< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionL2Norm : public ParallelReductionSum< Data, Result > 
-{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const Data& aux = data1[ index ]; - result += aux * aux; - } -}; - - -template< typename Data, typename Result = Data, typename PType = Data > -class ParallelReductionLpNorm : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] ), p ); - } - -protected: - PType p; -}; - - -/* - * Binary operations: reduction on two input vectors. 
- */ - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionEqualities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionInequalities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] != data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionScalarProduct : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] * data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffSum : public ParallelReductionSum< Result, 
Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] - data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsSum : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = 
Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffL2Norm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static 
constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const ResultType aux = data2[ index ] - data1[ index ]; - result += aux * aux; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1, typename PType = Data1 > -class ParallelReductionDiffLpNorm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] - data2[ index ] ), p ); - } - -protected: - PType p; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsValue : public ParallelReductionLogicalOr< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsOnlyValue : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - template< typename 
Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorAssignment.h b/src/TNL/Containers/Algorithms/VectorAssignment.h index 36829be82a70c2ff74f39d275c6a411c21e34873..c861579f4e90810b024d2d84b5ea3c2eeaf92234 100644 --- a/src/TNL/Containers/Algorithms/VectorAssignment.h +++ b/src/TNL/Containers/Algorithms/VectorAssignment.h @@ -12,7 +12,6 @@ #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h deleted file mode 100644 index a12b310e36b7d9bf2b458bceaf9db1547162863c..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/VectorOperations.h +++ /dev/null @@ -1,64 +0,0 @@ -/*************************************************************************** - VectorOperations.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< typename Device > -class VectorOperations{}; - -template<> -class VectorOperations< Devices::Host > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void 
segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -template<> -class VectorOperations< Devices::Cuda > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL - -#include -#include diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h deleted file mode 100644 index 4b53dbcf73eeb8b90a5388bbe7b1d443ff24574d..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +++ /dev/null @@ -1,58 +0,0 @@ -/*************************************************************************** - VectorOperationsCuda_impl.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, - typename Vector > -void -VectorOperations< Devices::Cuda >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, 
volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Cuda, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Cuda >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Cuda, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h deleted file mode 100644 index 50d591b9f31d11097eeb1047bdba465dda906b95..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ /dev/null @@ -1,55 +0,0 @@ -/*************************************************************************** - VectorOperationsHost_impl.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, typename Vector > -void -VectorOperations< Devices::Host >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename 
Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Host, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Host >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Host, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h index 75f204dccba363da9b5d079091680bd4272f03df..3438ddbd00378c0e3870a49b60f8151ef165a810 100644 --- a/src/TNL/Containers/DistributedVector.h +++ b/src/TNL/Containers/DistributedVector.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVector& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index 
73781c90640eebe0270082174e0a871b1fce511d..0a6ac1547f1f1b6f6fae3b92d1dcd7a2804a83b9 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -13,8 +13,7 @@ #pragma once #include "DistributedVector.h" -#include -#include +#include namespace TNL { namespace Containers { @@ -299,44 +298,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< Algorithms::PrefixSumType Type > void DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index 0ab9232001f84c7ea954f02dd21d55830b414cad..f0e7d91278f06f4ecf1f2688083b79c3e2ca66e9 100644 --- 
a/src/TNL/Containers/DistributedVectorView.h +++ b/src/TNL/Containers/DistributedVectorView.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVectorView& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 38715161cc46f25b2aa705b3cdb6c302aa1cad38..0268e35da5b2dc9ea1eb24dbec738015abe64a84 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -13,8 +13,7 @@ #pragma once #include "DistributedVectorView.h" -#include -#include +#include namespace TNL { namespace Containers { @@ -275,44 +274,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< Algorithms::PrefixSumType Type > void DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not 
implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index ff533e7813028f22914a129575fbe31bccebae14..616ad5807864a1b1f2cd7b5af765132021c99a24 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -65,9 +65,7 @@ struct VectorComparison< T1, T2, false > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -97,9 +95,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), 
std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -112,9 +108,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -127,9 +121,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -142,9 +134,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType 
>::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -160,9 +150,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -176,9 +164,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -187,9 +173,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, 
fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -198,9 +182,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -209,9 +191,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -227,9 +207,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const 
T2& b ) @@ -243,9 +221,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -254,9 +230,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -265,9 +239,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ 
-276,9 +248,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 405b0014ac77a5287bff0680c78b671c8104ab51..1d3077b432303bf5ecddc7f4ff48b71d1a5b9c46 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ -2187,12 +2187,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2201,19 +2199,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class 
Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2222,7 +2218,7 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2232,12 +2228,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2250,19 +2244,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, 
const Result& zero ) { using RealType = typename Vector::RealType; @@ -2275,7 +2267,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2285,12 +2277,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2303,19 +2293,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2328,7 +2316,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, 
zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h index 4e569a45d46ea5db0a7bb24e9418ae064e30d5ac..43940417bb0aa7fa97afe17e4fee3e4ce703a819 100644 --- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h +++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h @@ -70,15 +70,7 @@ auto DistributedExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile RealType& a, volatile RealType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::max() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::max() ); result.first = gatheredResults[ result.first ].first; } return result; @@ -135,15 +127,7 @@ auto DistributedExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile RealType& a, volatile RealType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::lowest() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::lowest() ); result.first = gatheredResults[ result.first ].first; } return result; diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h 
b/src/TNL/Containers/Expressions/ExpressionTemplates.h index 5a069b0b9958bf794b204e1236f530201dd4748b..f01d8a68e153f919871106d0039d6807f575fe1f 100644 --- a/src/TNL/Containers/Expressions/ExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h @@ -2110,12 +2110,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2124,19 +2122,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2145,7 +2141,7 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< 
DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2155,12 +2151,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2173,19 +2167,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2198,7 +2190,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2208,12 +2200,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + 
const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2226,19 +2216,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2251,7 +2239,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h index cce42592306b71cb6ca513b1e6efacbf29d1f5f6..1bafe7cfc0932be98839f7eee2fc99784dfaa55b 100644 --- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h @@ -2276,18 +2276,16 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const 
Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2295,18 +2293,16 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2317,20 +2313,18 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2339,20 +2333,18 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < 
Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2364,20 +2356,18 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } @@ -2386,20 +2376,18 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index a1250578084b4f95d6c4de23884d9396fea397b0..29e904bbfafb5c338a523227bb9e226d4fda9970 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -32,9 +32,8 @@ auto ExpressionMin( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = 
[=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a < b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a < b ? a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -45,7 +44,7 @@ auto ExpressionArgMin( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a > b ) { a = b; aIdx = bIdx; @@ -53,15 +52,7 @@ auto ExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } 
template< typename Expression > @@ -71,9 +62,8 @@ auto ExpressionMax( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a > b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a > b ? a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -84,15 +74,7 @@ auto ExpressionArgMax( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a < b ) { a = b; aIdx = bIdx; @@ -100,7 +82,7 @@ auto ExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, 
std::numeric_limits< ResultType >::lowest() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -110,9 +92,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -122,9 +102,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -134,9 +112,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return 
expression[ i ] * expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression, typename Real > @@ -146,9 +122,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -158,9 +132,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a *= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a *= b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 1 ); + return Algorithms::Reduction< typename 
Expression::DeviceType >::reduce( expression.getSize(), std::multiplies<>{}, fetch, (ResultType) 1 ); } template< typename Expression > @@ -170,9 +142,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a && b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a && b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -182,9 +152,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a || b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a || b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_or<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -194,9 +162,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] 
__cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a & b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a & b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -206,9 +172,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a | b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a | b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_or<>{}, fetch, (ResultType) 0 ); } } // namespace Expressions diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp index 7338c8317e4a5b29297240ff40d9de8ae73ff194..a8c626ee5e56925d9e1b9331bde87ddbdeff3a59 100644 --- a/src/TNL/Containers/Vector.hpp +++ b/src/TNL/Containers/Vector.hpp @@ -175,7 +175,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -190,7 +190,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( 
end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h index 1662afb7c815892faf2b336d065a2fdf861dfb70..0d03954549d49773e0c68c685d91fd048b1a7b5a 100644 --- a/src/TNL/Containers/VectorView.h +++ b/src/TNL/Containers/VectorView.h @@ -14,7 +14,7 @@ #include #include -#include +#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp index 986d2bc0fbde88de1d57670f64a6b6e08a176f10..9cdde8ef2a49422ee858ff87bb5634a8db075460 100644 --- a/src/TNL/Containers/VectorView.hpp +++ b/src/TNL/Containers/VectorView.hpp @@ -11,8 +11,6 @@ #pragma once #include -#include -#include #include #include @@ -127,7 +125,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -141,7 +139,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Devices/Host.h b/src/TNL/Devices/Host.h index 649749f588143ab80793e64c3d93e03bf2147229..40f55711a817e684f379e442ab30cdec485be013 100644 --- a/src/TNL/Devices/Host.h +++ b/src/TNL/Devices/Host.h @@ -113,17 +113,19 @@ public: } protected: - static bool& ompEnabled() { + static bool& 
ompEnabled() + { #ifdef HAVE_OPENMP - static bool ompEnabled( true ); + static bool ompEnabled = true; #else - static bool ompEnabled( false ); + static bool ompEnabled = false; #endif return ompEnabled; } - static int& maxThreadsCount() { - static int maxThreadsCount( -1 ); + static int& maxThreadsCount() + { + static int maxThreadsCount = -1; return maxThreadsCount; } }; diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index 1cc6901dc5fd142e53b4f4a1e3f458716bba0602..dd72e2832af81b65b9fdcc8d19090d669232bc62 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -84,9 +84,16 @@ protected: void hauseholder_cwy( VectorViewType v, const int i ); +// nvcc allows __cuda_callable__ lambdas only in public methods +#ifdef __NVCC__ +public: +#endif void hauseholder_cwy_transposed( VectorViewType z, const int i, ConstVectorViewType w ); +#ifdef __NVCC__ +protected: +#endif template< typename Vector > void update( const int k, diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index f45741151b03c8fadcd695c09d94abd168c12c7f..7f316d730b1bdad4f5439defd093d6140670c58e 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -427,14 +426,25 @@ hauseholder_generate( const int i, if( i > 0 ) { // aux = Y_{i-1}^T * y_i RealType aux[ i ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( y_i ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _y_i, ldSize] 
__cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i, + ( (RealType) 0, + fetch, + std::plus<>{}, size, - Y.getData(), - ldSize, - Traits::getLocalView( y_i ).getData(), + i, aux ); // no-op if the problem is not distributed CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); @@ -525,14 +535,25 @@ hauseholder_cwy_transposed( VectorViewType z, { // aux = Y_i^T * w RealType aux[ i + 1 ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i + 1, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( w ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _w = Traits::getConstLocalView( w ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i + 1, + ( (RealType) 0, + fetch, + std::plus<>{}, size, - Y.getData(), - ldSize, - Traits::getConstLocalView( w ).getData(), + i + 1, aux ); // no-op if the problem is not distributed Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index ccc98daa7ec9eacc4409e6de0688c62e89277cdc..12da6439bd15d4fdbe1e0a088910c940cfc90aa2 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -119,9 +119,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) continue; } } - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& 
b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; - this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + this->setResidue( addAndReduceAbs( u, currentTau * k1, std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** * When time is close to stopTime the new residue diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index ce21298b0c283e0258421b43325c32191a625816..3c88576e9e583c4c161782f630630573bd23be3a 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -173,16 +173,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType lastResidue = this->getResidue(); time += currentTau; - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), - reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); ///// // When time is close to stopTime the new residue // may be inaccurate significantly. if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); - if( ! 
this->nextIteration() ) return false; diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h index e2d07a02aaf2958cc4c2b410c8670a2ac8294650..430a42d5ecb8fee50a5986e20051311f17a30649 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -43,32 +43,26 @@ protected: using VectorViewType = typename DistributedVectorType::LocalViewType; using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >; - const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; - DistributedVectorType x, y, z; - - DistributedVectorView x_view, y_view, z_view; + DistributedVectorType v; + DistributedVectorView v_view; + typename DistributedVectorType::HostType v_host; const int rank = CommunicatorType::GetRank(group); const int nproc = CommunicatorType::GetSize(group); + // should be small enough to have fast tests, but large enough to test + // prefix-sum with multiple CUDA grids + const int globalSize = 10000 * nproc; + DistributedVectorTest() { using LocalRangeType = typename DistributedVector::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - x.setDistribution( localRange, globalSize, group ); - y.setDistribution( localRange, globalSize, group ); - z.setDistribution( localRange, globalSize, group ); - - x_view.bind( x ); - y_view.bind( y ); - z_view.bind( z ); - - setConstantSequence( x, 1 ); - setLinearSequence( y ); - setNegativeLinearSequence( z ); + v.setDistribution( localRange, globalSize, group ); + v_view.bind( v ); + setConstantSequence( v, 1 ); } }; @@ -85,7 +79,247 @@ using DistributedVectorTypes = ::testing::Types< TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes ); -// TODO: distributed prefix sum 
+#if 1 +TYPED_TEST( DistributedVectorTest, prefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. 
+ if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif + +#if 1 +TYPED_TEST( DistributedVectorTest, exclusivePrefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< 
Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( 
Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif #endif // HAVE_GTEST diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index 0487e916b8080bec0ead292ddb95725d8f84533a..7a321f5836cb2e0b2737c6176eb0b23794c4a501 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -94,29 +94,45 @@ using VectorTypes = ::testing::Types< TYPED_TEST_SUITE( MultireductionTest, VectorTypes ); -TYPED_TEST( MultireductionTest, scalarProduct ) +// idiot nvcc does not allow __cuda_callable__ lambdas inside private or protected regions +template< typename DeviceVector, typename HostVector > +void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVector& result ) { - using RealType = typename TestFixture::DeviceVector::RealType; - using DeviceType = typename 
TestFixture::DeviceVector::DeviceType; + using RealType = typename DeviceVector::RealType; + using DeviceType = typename DeviceVector::DeviceType; + using IndexType = typename DeviceVector::IndexType; + + const RealType* _V = V.getData(); + const RealType* _y = y.getData(); + const IndexType size = y.getSize(); + const int n = result.getSize(); + ASSERT_EQ( V.getSize(), size * n ); - ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + auto fetch = [=] __cuda_callable__ ( IndexType i, int k ) + { + TNL_ASSERT_LT( i, size, "BUG: fetcher got invalid index i" ); + TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); + return _V[ i + k * size ] * _y[ i ]; + }; Multireduction< DeviceType >::reduce - ( scalarProduct, - this->n, - this->size, - this->V.getData(), - this->size, - this->y.getData(), - this->result.getData() ); - - for( int i = 0; i < this->n; i++ ) { + ( (RealType) 0, + fetch, + std::plus<>{}, + size, + n, + result.getData() ); + + for( int i = 0; i < n; i++ ) { if( i % 2 == 0 ) - EXPECT_EQ( this->result[ i ], 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], 0.5 * size * ( size - 1 ) ); else - EXPECT_EQ( this->result[ i ], - 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], - 0.5 * size * ( size - 1 ) ); } } - +TYPED_TEST( MultireductionTest, scalarProduct ) +{ + test_multireduction( this->V, this->y, this->result ); +} #endif // HAVE_GTEST diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index 8fae768849c26406ecabc08c9fcf261e686d9453..93283483c8a06e89c6e9dfceee81f82e391c5955 100644 --- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace binary_tests { diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h 
b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index d8c26b93c7a9da3a67fcb499f342dafa7f2d269d..8c0cd7f90bf20821043c46f4c6d7a9c74fbb511c 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -22,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // should be small enough to have fast tests, but larger than minGPUReductionDataSize @@ -37,9 +36,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return evaluateAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return evaluateAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } TYPED_TEST( VectorTest, evaluateAndReduce ) @@ -74,9 +71,7 @@ performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } template< typename VectorView > @@ -85,9 +80,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, 5.0 * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( 
w, 5.0 * v, std::plus<>{}, ( RealType ) 0.0 ); } diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 733013933014f016f5c0faf18dcc8bdea12bc16f..4659d365d87936e595c7462a7e95096410086e6c 100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -20,102 +20,120 @@ constexpr int VECTOR_TEST_SIZE = 10000; TYPED_TEST( VectorTest, prefixSum ) { using VectorType = typename TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) - return; + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; VectorType v( size ); ViewType v_view( v ); typename VectorType::HostType v_host( size ); - v = 0; + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 
); v_host = -1; v_view.prefixSum(); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::setMaxGridSize( 3 ); - v = 0; + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 
0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } @@ -123,15 +141,14 @@ TYPED_TEST( VectorTest, prefixSum ) TYPED_TEST( VectorTest, exclusivePrefixSum ) { using VectorType = typename 
TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) return; VectorType v; @@ -139,103 +156,106 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) ViewType v_view( v ); typename VectorType::HostType v_host( size ); - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 
); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + 
EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType 
>::gridsCount() ), 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } diff --git a/src/UnitTests/Containers/VectorTestSetup.h b/src/UnitTests/Containers/VectorTestSetup.h index 7338f9637d3178a4d9b567af59e6c201c419f30e..5c342dced87f713824344cb43ee1c9922dbf0ef6 100644 --- a/src/UnitTests/Containers/VectorTestSetup.h +++ b/src/UnitTests/Containers/VectorTestSetup.h @@ -8,8 +8,6 @@ /* See Copyright Notice in tnl/Copyright */ -// NOTE: Vector = Array + VectorOperations, so we test Vector and VectorOperations at the same time - #pragma once #ifdef HAVE_GTEST @@ -24,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // test fixture for typed tests @@ -33,7 +30,6 @@ class VectorTest : public ::testing::Test { protected: using VectorType = Vector; - using VectorOperations = Algorithms::VectorOperations< typename VectorType::DeviceType >; using ViewType = VectorView< typename Vector::RealType, typename Vector::DeviceType, typename Vector::IndexType >; }; diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index a0aa2695f054295ec83ced0e7b9fa2af70249872..1224042532b2c55eb33757e61fe603219840aa61 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace unary_tests { diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index 758ee1d50026dd0c23ccbf51f37c268732e5e16b..93ff286ae65fb5b9dc170903d6aeb16032ebaa69 100644 --- 
a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace vertical_tests {