From 1a82b047c1efe3a97983fc4f297a310679ef8082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 09:29:07 +0200 Subject: [PATCH 01/23] Removed reduction and multireduction declarations for MIC They are not implemented anyway... --- .../Containers/Algorithms/Multireduction.h | 20 +------------ .../Algorithms/Multireduction_impl.h | 14 --------- src/TNL/Containers/Algorithms/Reduction.h | 30 ------------------- 3 files changed, 1 insertion(+), 63 deletions(-) diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 42b8bf28d..6f64d31d5 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -14,16 +14,13 @@ #include #include -#include namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > -class Multireduction -{ -}; +class Multireduction; template<> class Multireduction< Devices::Cuda > @@ -55,21 +52,6 @@ public: typename Operation::ResultType* hostResult ); }; -template<> -class Multireduction< Devices::MIC > -{ -public: - template< typename Operation, typename Index > - static void - reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); -}; - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h index ebd0ad256..3bc0166bd 100644 --- a/src/TNL/Containers/Algorithms/Multireduction_impl.h +++ b/src/TNL/Containers/Algorithms/Multireduction_impl.h @@ -308,20 +308,6 @@ reduce( Operation& operation, #endif } -template< typename Operation, typename Index > -void -Multireduction< Devices::MIC >:: -reduce( Operation& operation, - const int n, - const Index size, - 
const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - throw Exceptions::NotImplementedError("Multireduction is not implemented for MIC."); -} - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index e2c6d5295..1f0c10a74 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -16,7 +16,6 @@ #include #include -#include namespace TNL { namespace Containers { @@ -83,35 +82,6 @@ class Reduction< Devices::Cuda > const Result& zero ); }; -template<> -class Reduction< Devices::MIC > -{ - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); - - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); -}; - } // namespace Algorithms } // namespace Containers } // namespace TNL -- GitLab From e470040a4ffe706ffd31c24c14ca5c6850cc9fb7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 14:58:27 +0200 Subject: [PATCH 02/23] Style changes in the code for reduction --- .../Algorithms/CudaReductionKernel.h | 467 +++++++++--------- src/TNL/Containers/Algorithms/Reduction.h | 96 ++-- src/TNL/Containers/Algorithms/Reduction.hpp | 150 ++---- 3 files changed, 327 insertions(+), 386 deletions(-) diff --git 
a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 89e13a272..76c8f81b3 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -12,20 +12,17 @@ #include // std::pair -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include +#include namespace TNL { namespace Containers { namespace Algorithms { -#ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given @@ -34,6 +31,7 @@ namespace Algorithms { static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 static constexpr int Reduction_registersPerThread = 32; // empirically determined optimal value +#ifdef HAVE_CUDA // __CUDA_ARCH__ is defined only in device code! #if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 8; @@ -42,11 +40,11 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, @@ -56,19 +54,16 @@ CudaReductionKernel( const Result zero, const Index size, Result* output ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. 
x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSize < size ) { reduction( sdata[ tid ], dataFetcher( gid ) ); reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); @@ -111,28 +106,23 @@ CudaReductionKernel( const Result zero, // This runs in one warp so it is synchronized implicitly. if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSize >= 64 ) { + volatile Result* vsdata = sdata; + if( blockSize >= 64 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { + if( blockSize >= 16 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { + if( blockSize >= 8 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { + if( blockSize >= 4 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { + if( blockSize >= 2 ) volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } } // Store the result back in the global memory. 
@@ -141,11 +131,11 @@ CudaReductionKernel( const Result zero, } template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionWithArgumentKernel( const Result zero, @@ -157,18 +147,15 @@ CudaReductionWithArgumentKernel( const Result zero, Index* idxOutput, const Index* idxInput = nullptr ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] ); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. if( idxInput ) { if( gid < size ) { sdata[ tid ] = dataFetcher( gid ); @@ -245,29 +232,24 @@ CudaReductionWithArgumentKernel( const Result zero, // This runs in one warp so it is synchronized implicitly. 
if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - volatile IndexType* vsidx = sidx; - if( blockSize >= 64 ) { + volatile Result* vsdata = sdata; + volatile Index* vsidx = sidx; + if( blockSize >= 64 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] ); - } // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { + if( blockSize >= 16 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { + if( blockSize >= 8 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { + if( blockSize >= 4 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { + if( blockSize >= 2 ) volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] ); - } } // Store the result back in the global memory. @@ -276,15 +258,13 @@ CudaReductionWithArgumentKernel( const Result zero, idxOutput[ blockIdx.x ] = sidx[ 0 ]; } } +#endif template< typename Index, typename Result > struct CudaReductionKernelLauncher { - using IndexType = Index; - using ResultType = Result; - // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. 
This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -315,13 +295,13 @@ struct CudaReductionKernelLauncher const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output ) + Result*& output ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType ); + const std::size_t buf_size = 2 * desGridSize * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result >(); this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); return this->reducedSize; @@ -334,15 +314,15 @@ struct CudaReductionKernelLauncher const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output, - IndexType*& idxOutput ) + Result*& output, + Index*& idxOutput ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * ( sizeof( ResultType ) + sizeof( IndexType ) ); + const std::size_t buf_size = 2 * desGridSize * ( sizeof( Result ) + sizeof( Index ) ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); - idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] ); + output = cudaReductionBuffer.template getData< Result >(); + idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] ); this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr ); return this->reducedSize; @@ -357,13 +337,13 @@ struct 
CudaReductionKernelLauncher { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; + auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); std::swap( input, output ); } @@ -373,7 +353,7 @@ struct CudaReductionKernelLauncher std::swap( input, output ); // Copy result on CPU - ResultType result; + Result result; ArrayOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 ); return result; } @@ -387,15 +367,15 @@ struct CudaReductionKernelLauncher { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; - IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] ); - IndexType* idxOutput = &idxInput[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; + Index* idxInput = reinterpret_cast< Index* >( &output[ desGridSize ] ); + Index* idxOutput = &idxInput[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; + auto copyFetch = [input] 
__cuda_callable__ ( Index i ) { return input[ i ]; }; this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput ); std::swap( input, output ); std::swap( idxInput, idxOutput ); @@ -426,95 +406,99 @@ struct CudaReductionKernelLauncher const Result& zero, Result* output ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * sizeof( ResultType ) - : blockSize.x * sizeof( ResultType ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * sizeof( Result ) + : blockSize.x * sizeof( Result ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, 
DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. + switch( blockSize.x ) + { + case 512: + CudaReductionKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, 
VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." 
); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - CudaReductionKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); - } + CudaReductionKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); + } - // Return the size of the output array on the CUDA device - return gridSize.x; + // Return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } template< typename DataFetcher, @@ -529,105 +513,108 @@ struct CudaReductionKernelLauncher Index* idxOutput, const Index* idxInput ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ) - : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) ) + : blockSize.x * ( sizeof( Result ) + sizeof( Index ) ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionWithArgumentKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, 
Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. 
+ switch( blockSize.x ) + { + case 512: + CudaReductionWithArgumentKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, 
Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." 
); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); - } + CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); + } - // return the size of the output array on the CUDA device - return gridSize.x; + // return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } const int activeDevice; const int blocksdPerMultiprocessor; const int desGridSize; - const IndexType originalSize; - IndexType reducedSize; + const Index originalSize; + Index reducedSize; }; -#endif } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index 1f0c10a74..6e5f9c12a 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -22,64 +22,62 @@ namespace Containers { namespace Algorithms { template< typename Device > -class Reduction; +struct Reduction; template<> -class Reduction< Devices::Host > +struct Reduction< Devices::Host > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename 
ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> -class Reduction< Devices::Cuda > +struct Reduction< Devices::Cuda > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename VolatileReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, + DataFetcher& dataFetcher, + const Result& zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 9d54fd7b1..fd1d781c7 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp 
@@ -12,13 +12,11 @@ #pragma once +#include // std::unique_ptr //#define CUDA_REDUCTION_PROFILING -#include -#include #include -#include #include #include @@ -53,24 +51,21 @@ reduce( const Index size, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable - ResultType result = zero; + Result result = zero; #pragma omp parallel { // initialize array for thread-local results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { reduction( r[ 0 ], dataFetcher( offset + i ) ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); @@ -82,7 +77,7 @@ reduce( const Index size, // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( r[ 0 ], dataFetcher( i ) ); } @@ -103,11 +98,11 @@ reduce( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { reduction( r[ 0 ], dataFetcher( offset + i ) ); reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); @@ -117,7 +112,7 @@ reduce( const Index size, } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * 
block_size; i < size; i++ ) reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); @@ -128,8 +123,8 @@ reduce( const Index size, return r[ 0 ]; } else { - ResultType result = zero; - for( IndexType i = 0; i < size; i++ ) + Result result = zero; + for( Index i = 0; i < size; i++ ) reduction( result, dataFetcher( i ) ); return result; } @@ -151,9 +146,6 @@ reduceWithArgument( const Index size, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; @@ -164,13 +156,13 @@ reduceWithArgument( const Index size, #pragma omp parallel { // initialize array for thread-local results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! 
initialized ) { arg[ 0 ] = offset + i; @@ -194,7 +186,7 @@ reduceWithArgument( const Index size, // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); } @@ -217,13 +209,13 @@ reduceWithArgument( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! initialized ) { @@ -246,7 +238,7 @@ reduceWithArgument( const Index size, } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results @@ -257,7 +249,7 @@ reduceWithArgument( const Index size, } else { std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); - for( IndexType i = 1; i < size; i++ ) + for( Index i = 1; i < size; i++ ) reduction( result.first, i, result.second, dataFetcher( i ) ); return result; } @@ -279,18 +271,10 @@ reduce( const Index size, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. 
- */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. + constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -298,18 +282,17 @@ reduce( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + CudaReductionKernelLauncher< Index, Result > reductionLauncher( size ); - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( 0 ); - IndexType reducedSize = reductionLauncher.start( + // start the reduction on the GPU + Result* deviceAux1( 0 ); + Index reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, zero, deviceAux1 ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -318,10 +301,8 @@ reduce( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. 
- */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -331,11 +312,9 @@ reduce( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); + // finish the reduction on the host + auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; + const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -344,9 +323,7 @@ reduce( const Index size, return result; } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. - */ + // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); #ifdef CUDA_REDUCTION_PROFILING @@ -358,10 +335,7 @@ reduce( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; +} template< typename Index, typename Result, @@ -376,18 +350,10 @@ reduceWithArgument( const Index size, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. 
- */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. + constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -395,20 +361,19 @@ reduceWithArgument( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + CudaReductionKernelLauncher< Index, Result > reductionLauncher( size ); - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( nullptr ); - IndexType* deviceIndexes( nullptr ); - IndexType reducedSize = reductionLauncher.startWithArgument( + // start the reduction on the GPU + Result* deviceAux1( nullptr ); + Index* deviceIndexes( nullptr ); + Index reducedSize = reductionLauncher.startWithArgument( reduction, volatileReduction, dataFetcher, zero, deviceAux1, deviceIndexes ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -417,11 +382,9 @@ reduceWithArgument( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. 
- */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - std::unique_ptr< IndexType[] > indexArray{ new IndexType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; + std::unique_ptr< Index[] > indexArray{ new Index[ reducedSize ] }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); ArrayOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); @@ -432,12 +395,10 @@ reduceWithArgument( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - //auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - //const ResultType result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); - for( IndexType i = 1; i < reducedSize; i++ ) + // finish the reduction on the host +// auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; +// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); + for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); #ifdef CUDA_REDUCTION_PROFILING @@ -447,9 +408,7 @@ reduceWithArgument( const Index size, return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] ); } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
- */ + // data can't be safely reduced on host, so continue with the reduction on the GPU auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); #ifdef CUDA_REDUCTION_PROFILING @@ -461,9 +420,6 @@ reduceWithArgument( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif } } // namespace Algorithms -- GitLab From b74a24d22e012ce1a25ad89ed906482a23588540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 15:13:16 +0200 Subject: [PATCH 03/23] Rewritten multireduction with lambda functions --- .../Algorithms/CudaMultireductionKernel.h | 249 ++++++-------- .../Containers/Algorithms/Multireduction.h | 70 ++-- .../Containers/Algorithms/Multireduction.hpp | 231 +++++++++++++ .../Algorithms/Multireduction_impl.h | 313 ------------------ src/TNL/Solvers/Linear/GMRES.h | 7 + src/TNL/Solvers/Linear/GMRES_impl.h | 53 ++- src/UnitTests/Containers/MultireductionTest.h | 51 ++- 7 files changed, 471 insertions(+), 503 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/Multireduction.hpp delete mode 100644 src/TNL/Containers/Algorithms/Multireduction_impl.h diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index e8fc7f8bb..47919b351 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -12,14 +12,11 @@ #pragma once -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include namespace TNL { namespace Containers { @@ -32,157 +29,128 @@ namespace Algorithms { * architecture so that there are no local memory spills. 
*/ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2 -static constexpr int Multireduction_registersPerThread = 38; // empirically determined optimal value +static constexpr int Multireduction_registersPerThread = 32; // empirically determined optimal value // __CUDA_ARCH__ is defined only in device code! #if (__CUDA_ARCH__ >= 300 ) - static constexpr int Multireduction_minBlocksPerMultiprocessor = 6; + static constexpr int Multireduction_minBlocksPerMultiprocessor = 8; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSizeX, typename Operation, typename Index > +template< int blockSizeX, + typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) -CudaMultireductionKernel( Operation operation, - const int n, +CudaMultireductionKernel( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + const int n, + Result* output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - - /*** - * Get thread id (tid) and global element id (gid). - * gridSizeX is the number of elements in the direction of x-axis - * processed by all blocks at the same time. - */ - const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x; - IndexType gid = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType gridSizeX = blockDim.x * gridDim.x; - - /*** - * Shift input1 and output pointers. 
- */ - const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; - if( y < n ) { - input1 += y * ldInput1; - output += y * gridDim.x; - } - else - return; - - /*** - * Start with the sequential reduction and push the - * result into the shared memory. - */ - sdata[ tid ] = operation.initialValue(); - while( gid + 4 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + + // Get the thread id (tid), global thread id (gid) and gridSize. + const Index tid = threadIdx.y * blockDim.x + threadIdx.x; + Index gid = blockIdx.x * blockDim.x + threadIdx.x; + const Index gridSizeX = blockDim.x * gridDim.x; + + // Get the dataset index. + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if( y >= n ) return; + + sdata[ tid ] = zero; + + // Start with the sequential reduction and push the result into the shared memory. 
+ while( gid + 4 * gridSizeX < size ) { + reduction( sdata[ tid ], dataFetcher( gid, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); gid += 4 * gridSizeX; } - while( gid + 2 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); + while( gid + 2 * gridSizeX < size ) { + reduction( sdata[ tid ], dataFetcher( gid, y ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); gid += 2 * gridSizeX; } - while( gid < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); + while( gid < size ) { + reduction( sdata[ tid ], dataFetcher( gid, y ) ); gid += gridSizeX; } __syncthreads(); - - //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); - - /*** - * Perform the parallel reduction. - */ + // Perform the parallel reduction. if( blockSizeX >= 1024 ) { - if( threadIdx.x < 512 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); - } + if( threadIdx.x < 512 ) + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSizeX >= 512 ) { - if( threadIdx.x < 256 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); - } + if( threadIdx.x < 256 ) + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSizeX >= 256 ) { - if( threadIdx.x < 128 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); - } + if( threadIdx.x < 128 ) + reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSizeX >= 128 ) { - if( threadIdx.x < 64 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); - } + if( threadIdx.x < 64 ) + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } - /*** - * This runs in one warp so it is synchronized implicitly. 
- * - * When the blockSizeX is less then or equal to the warp size, the shared memory - * must be at least 2 * blockSizeX elements per block, otherwise unallocated memory - * will be accessed !!! - */ + // This runs in one warp so it is synchronized implicitly. if( threadIdx.x < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSizeX >= 64 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } - if( blockSizeX >= 32 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSizeX >= 16 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSizeX >= 8 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSizeX >= 4 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSizeX >= 2 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } + volatile Result* vsdata = sdata; + if( blockSizeX >= 64 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + // Note that here we do not have to check if tid < 16 etc, because we have + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSizeX >= 32 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + if( blockSizeX >= 16 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + if( blockSizeX >= 8 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + if( blockSizeX >= 4 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + if( blockSizeX >= 2 ) + volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); } - /*** - * Store the result back in the global memory. - */ + // Store the result back in the global memory. 
if( threadIdx.x == 0 ) { - output[ blockIdx.x ] = sdata[ tid ]; + output[ blockIdx.x + y * gridDim.x ] = sdata[ tid ]; } } +#endif -template< typename Operation, typename Index > +template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > int -CudaMultireductionKernelLauncher( Operation& operation, - const int n, +CudaMultireductionKernelLauncher( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType*& output ) + const int n, + Result*& output ) { - typedef typename Operation::ResultType ResultType; - +#ifdef HAVE_CUDA // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -197,7 +165,7 @@ CudaMultireductionKernelLauncher( Operation& operation, / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread ); const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); dim3 blockSize, gridSize; - + // version A: max 16 rows of threads blockSize.y = TNL::min( n, 16 ); @@ -231,87 +199,86 @@ CudaMultireductionKernelLauncher( Operation& operation, // create reference to the reduction buffer singleton and set size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) - const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); + const std::size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template 
getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result >(); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * blockSize.y * sizeof( ResultType ) - : blockSize.x * blockSize.y * sizeof( ResultType ); - - //cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem < - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 256: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 128: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 64: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, 
Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 32: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 16: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 8: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 4: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); 
CudaMultireductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 2: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); default: throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; // return the size of the output array on the CUDA device return gridSize.x; -} +#else + throw Exceptions::CudaSupportMissing(); #endif +} } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 6f64d31d5..00ca6078a 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -20,40 +20,70 @@ namespace Containers { namespace Algorithms { template< typename Device > -class Multireduction; +struct Multireduction; template<> -class Multireduction< Devices::Cuda > +struct Multireduction< Devices::Host > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the 
reduction operation + * volatileReduction: callable object representing the reduction operation + * size: the size of each dataset + * n: number of datasets to be reduced + * result: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + const int n, + Result* result ); }; template<> -class Multireduction< Devices::Host > +struct Multireduction< Devices::Cuda > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the reduction operation + * volatileReduction: callable object representing the reduction operation + * size: the size of each dataset + * n: number of datasets to be reduced + * hostResult: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + const int n, + Result* hostResult ); }; } // namespace Algorithms } // namespace 
Containers } // namespace TNL -#include "Multireduction_impl.h" +#include "Multireduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp new file mode 100644 index 000000000..f5c193c42 --- /dev/null +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -0,0 +1,231 @@ +/*************************************************************************** + Multireduction_impl.h - description + ------------------- + begin : May 13, 2016 + copyright : (C) 2016 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include // std::unique_ptr + +//#define CUDA_REDUCTION_PROFILING + +#include +#include +#include +#include + +#ifdef CUDA_REDUCTION_PROFILING +#include +#include +#endif + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > +void +Multireduction< Devices::Host >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, + const Index size, + const int n, + Result* result ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." 
); + + constexpr int block_size = 128; + const int blocks = size / block_size; + +#ifdef HAVE_OPENMP + if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) +#pragma omp parallel + { + // first thread initializes the result array + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + } + + // initialize array for thread-local results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + #pragma omp for nowait + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // the first thread that reaches here processes the last, incomplete block + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + } + + // local reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + reduction( _r[ 0 ], _r[ 1 ] ); + reduction( _r[ 0 ], _r[ 2 ] ); + reduction( _r[ 0 ], _r[ 3 ] ); + } + + // inter-thread reduction of local results + #pragma omp critical + { + for( int k = 0; k < n; k++ ) + reduction( result[ k ], r[ 4 * k ] ); + } + } + else { +#endif + if( blocks > 1 ) { + // initialize array for unrolled results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + // main reduction (explicitly unrolled loop) + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 
* k; + for( int i = 0; i < block_size; i += 4 ) { + reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // reduction of the last, incomplete block (not unrolled) + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + + // reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + reduction( _r[ 0 ], _r[ 1 ] ); + reduction( _r[ 0 ], _r[ 2 ] ); + reduction( _r[ 0 ], _r[ 3 ] ); + + // copy the result into the output parameter + result[ k ] = _r[ 0 ]; + } + } + else { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + for( int i = 0; i < block_size; i++ ) + reduction( result[ k ], dataFetcher( offset + i, k ) ); + } + } + + for( int k = 0; k < n; k++ ) { + for( Index i = blocks * block_size; i < size; i++ ) + reduction( result[ k ], dataFetcher( i, k ) ); + } + } +#ifdef HAVE_OPENMP + } +#endif +} + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename VolatileReduction, + typename Index > +void +Multireduction< Devices::Cuda >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const VolatileReduction volatileReduction, + const Index size, + const int n, + Result* hostResult ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." 
); + + #ifdef CUDA_REDUCTION_PROFILING + Timer timer; + timer.reset(); + timer.start(); + #endif + + // start the reduction on the GPU + Result* deviceAux1 = nullptr; + const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, volatileReduction, size, n, deviceAux1 ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; + timer.reset(); + timer.start(); + #endif + + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] }; + ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; + timer.reset(); + timer.start(); + #endif + + // finish the reduction on the host + auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; + Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, volatileReduction, reducedSize, n, hostResult ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; + #endif +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h deleted file mode 100644 index 3bc0166bd..000000000 --- a/src/TNL/Containers/Algorithms/Multireduction_impl.h +++ /dev/null @@ -1,313 +0,0 @@ -/*************************************************************************** - Multireduction_impl.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. 
- email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "Multireduction.h" - -//#define CUDA_REDUCTION_PROFILING - -#include -#include -#include -#include -#include -#include - -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/**** - * Arrays smaller than the following constant are reduced on CPU. - */ -//static constexpr int Multireduction_minGpuDataSize = 16384;//65536; //16384;//1024;//256; -// TODO: benchmarks with different values -static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * deviceInput1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the deviceInput1 array - * deviceInput2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Cuda >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ) -{ -#ifdef HAVE_CUDA - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." 
); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - typedef typename Operation::LaterReductionOperation LaterReductionOperation; - - /*** - * First check if the input array(s) is/are large enough for the multireduction on GPU. - * Otherwise copy it/them to host and multireduce on CPU. - */ - if( n * ldInput1 < Multireduction_minGpuDataSize ) { - DataType1 hostArray1[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray1, deviceInput1, n * ldInput1 ); - if( deviceInput2 ) { - using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; - _DT2 hostArray2[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray2, (_DT2*) deviceInput2, size ); - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); - } - else { - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); - } - return; - } - - #ifdef CUDA_REDUCTION_PROFILING - Timer timer; - timer.reset(); - timer.start(); - #endif - - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1 = nullptr; - const IndexType reducedSize = CudaMultireductionKernelLauncher( operation, - n, - size, - deviceInput1, - ldInput1, - deviceInput2, - deviceAux1 ); - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; - timer.reset(); - timer.start(); - #endif - - /*** - * Transfer the reduced data from device to host. 
- */ - ResultType resultArray[ n * reducedSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray, deviceAux1, n * reducedSize ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. " << std::endl; - timer.reset(); - timer.start(); - #endif - -// std::cout << "resultArray = ["; -// for( int i = 0; i < n * reducedSize; i++ ) { -// std::cout << resultArray[ i ]; -// if( i < n * reducedSize - 1 ) -// std::cout << ", "; -// } -// std::cout << "]" << std::endl; - - /*** - * Reduce the data on the host system. - */ - LaterReductionOperation laterReductionOperation; - Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (void*) nullptr, hostResult ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; - #endif - - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * input1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the input1 array - * input2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Host >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." 
); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - - constexpr int block_size = 128; - const int blocks = size / block_size; - -#ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) -#pragma omp parallel - { - // first thread initializes the result array - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - } - - // initialize array for thread-local results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - #pragma omp for nowait - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // the first thread that reaches here processes the last, incomplete block - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - } - - // local reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - } - - // inter-thread reduction of local results - #pragma omp critical - { - for( int k 
= 0; k < n; k++ ) - operation.commonReduction( result[ k ], r[ 4 * k ] ); - } - } - else { -#endif - if( blocks > 1 ) { - // initialize array for unrolled results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - // main reduction (explicitly unrolled loop) - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // reduction of the last, incomplete block (not unrolled) - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - - // reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - - // copy the result into the output parameter - result[ k ] = _r[ 0 ]; - } - } - else { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - for( int i = 0; i < block_size; i++ ) - operation.firstReduction( result[ k ], offset + i, _input1, input2 ); - } - } - - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; 
- for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( result[ k ], i, _input1, input2 ); - } - } -#ifdef HAVE_OPENMP - } -#endif -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index 1cc6901dc..dd72e2832 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -84,9 +84,16 @@ protected: void hauseholder_cwy( VectorViewType v, const int i ); +// nvcc allows __cuda_callable__ lambdas only in public methods +#ifdef __NVCC__ +public: +#endif void hauseholder_cwy_transposed( VectorViewType z, const int i, ConstVectorViewType w ); +#ifdef __NVCC__ +protected: +#endif template< typename Vector > void update( const int k, diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index f45741151..d1ef7bc97 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -427,14 +426,28 @@ hauseholder_generate( const int i, if( i > 0 ) { // aux = Y_{i-1}^T * y_i RealType aux[ i ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( y_i ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; + auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile 
RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i, + ( (RealType) 0, + fetch, + reduction, + volatileReduction, size, - Y.getData(), - ldSize, - Traits::getLocalView( y_i ).getData(), + i, aux ); // no-op if the problem is not distributed CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); @@ -525,14 +538,28 @@ hauseholder_cwy_transposed( VectorViewType z, { // aux = Y_i^T * w RealType aux[ i + 1 ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i + 1, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( w ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _w = Traits::getConstLocalView( w ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; + auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i + 1, + ( (RealType) 0, + fetch, + reduction, + volatileReduction, size, - Y.getData(), - ldSize, - Traits::getConstLocalView( w ).getData(), + i + 1, aux ); // no-op if the problem is not distributed Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index 0487e916b..29d2800f3 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -94,29 
+94,48 @@ using VectorTypes = ::testing::Types< TYPED_TEST_SUITE( MultireductionTest, VectorTypes ); -TYPED_TEST( MultireductionTest, scalarProduct ) +// idiot nvcc does not allow __cuda_callable__ lambdas inside private or protected regions +template< typename DeviceVector, typename HostVector > +void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVector& result ) { - using RealType = typename TestFixture::DeviceVector::RealType; - using DeviceType = typename TestFixture::DeviceVector::DeviceType; + using RealType = typename DeviceVector::RealType; + using DeviceType = typename DeviceVector::DeviceType; + using IndexType = typename DeviceVector::IndexType; + + const RealType* _V = V.getData(); + const RealType* _y = y.getData(); + const IndexType size = y.getSize(); + const int n = result.getSize(); + ASSERT_EQ( V.getSize(), size * n ); - ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + auto fetch = [=] __cuda_callable__ ( IndexType i, int k ) + { + TNL_ASSERT_LT( i, size, "BUG: fetcher got invalid index i" ); + TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); + return _V[ i + k * size ] * _y[ i ]; + }; + auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Multireduction< DeviceType >::reduce - ( scalarProduct, - this->n, - this->size, - this->V.getData(), - this->size, - this->y.getData(), - this->result.getData() ); - - for( int i = 0; i < this->n; i++ ) { + ( (RealType) 0, + fetch, + reduction, + volatileReduction, + size, + n, + result.getData() ); + + for( int i = 0; i < n; i++ ) { if( i % 2 == 0 ) - EXPECT_EQ( this->result[ i ], 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], 0.5 * size * ( size - 1 ) ); else - EXPECT_EQ( this->result[ i ], - 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], - 0.5 * size * ( size - 1 ) ); } } - 
+TYPED_TEST( MultireductionTest, scalarProduct ) +{ + test_multireduction( this->V, this->y, this->result ); +} #endif // HAVE_GTEST -- GitLab From cbc2fff9dc7be94518bb2a9872fa5f43b3134148 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 18:57:47 +0200 Subject: [PATCH 04/23] Found a way to avoid using volatile in CUDA reduction: __syncwarp() The performance seems to be identical to the code using volatile. --- .../Algorithms/CudaMultireductionKernel.h | 58 +++---- .../Algorithms/CudaReductionKernel.h | 153 +++++++++--------- .../Containers/Algorithms/Multireduction.hpp | 2 +- src/TNL/Containers/Algorithms/Reduction.hpp | 6 +- 4 files changed, 104 insertions(+), 115 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index 47919b351..782e66eaa 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -42,14 +42,12 @@ template< int blockSizeX, typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) CudaMultireductionKernel( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* output ) @@ -108,25 +106,29 @@ CudaMultireductionKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( threadIdx.x < 32 ) { - volatile Result* vsdata = sdata; if( blockSizeX >= 64 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSizeX >= 32 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); if( blockSizeX >= 16 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); if( blockSizeX >= 8 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); if( blockSizeX >= 4 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); if( blockSizeX >= 2 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -139,13 +141,11 @@ CudaMultireductionKernel( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > int CudaMultireductionKernelLauncher( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result*& output ) @@ -215,55 +215,55 @@ CudaMultireductionKernelLauncher( const Result zero, { case 512: CudaMultireductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 256: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 128: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 64: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, 
cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 32: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 16: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 8: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 4: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, 
DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 2: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, n, output ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 76c8f81b3..03da1e556 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -43,14 +43,12 @@ template< int blockSize, typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output ) { @@ -104,25 +102,29 @@ CudaReductionKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile Result* vsdata = sdata; if( blockSize >= 64 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); if( blockSize >= 16 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); if( blockSize >= 8 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); if( blockSize >= 4 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); if( blockSize >= 2 ) - volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. @@ -134,14 +136,12 @@ template< int blockSize, typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionWithArgumentKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output, Index* idxOutput, @@ -230,26 +230,29 @@ CudaReductionWithArgumentKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile Result* vsdata = sdata; - volatile Index* vsidx = sidx; if( blockSize >= 64 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); if( blockSize >= 16 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); if( blockSize >= 8 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); if( blockSize >= 4 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); if( blockSize >= 2 ) - volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -289,10 +292,8 @@ struct CudaReductionKernelLauncher } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int start( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result*& output ) @@ -303,15 +304,13 @@ struct CudaReductionKernelLauncher cudaReductionBuffer.setSize( buf_size ); output = cudaReductionBuffer.template getData< Result >(); - this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); + this->reducedSize = this->launch( originalSize, reduction, dataFetcher, zero, output ); return this->reducedSize; } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int startWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result*& output, @@ -324,15 +323,13 @@ struct CudaReductionKernelLauncher output = cudaReductionBuffer.template getData< Result >(); idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] ); - this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr ); + this->reducedSize = this->launchWithArgument( originalSize, reduction, dataFetcher, zero, output, idxOutput, nullptr ); return this->reducedSize; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > Result finish( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half @@ -344,7 +341,7 @@ struct CudaReductionKernelLauncher { // this lambda has to be defined inside the loop, because the captured variable changes auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; - this->reducedSize = this->launch( 
this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); + this->reducedSize = this->launch( this->reducedSize, reduction, copyFetch, zero, output ); std::swap( input, output ); } @@ -358,11 +355,9 @@ struct CudaReductionKernelLauncher return result; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > std::pair< Index, Result > finishWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half @@ -376,7 +371,7 @@ struct CudaReductionKernelLauncher { // this lambda has to be defined inside the loop, because the captured variable changes auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; - this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput ); + this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, copyFetch, zero, output, idxOutput, idxInput ); std::swap( input, output ); std::swap( idxInput, idxOutput ); } @@ -397,11 +392,9 @@ struct CudaReductionKernelLauncher protected: template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launch( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result* output ) @@ -425,55 +418,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, 
Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); 
CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." 
<< std::endl ); @@ -485,10 +478,10 @@ struct CudaReductionKernelLauncher // Check just to future-proof the code setting blockSize.x if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); } else { TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); @@ -502,11 +495,9 @@ struct CudaReductionKernelLauncher } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launchWithArgument( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result* output, @@ -532,55 +523,55 @@ struct CudaReductionKernelLauncher { case 512: CudaReductionWithArgumentKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 256: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, 
volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 128: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 64: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 32: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 16: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, 
cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 8: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 4: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 2: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput 
); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); @@ -592,10 +583,10 @@ struct CudaReductionKernelLauncher // Check just to future-proof the code setting blockSize.x if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); } else { TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index f5c193c42..e96e8e649 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -196,7 +196,7 @@ reduce( const Result zero, // start the reduction on the GPU Result* deviceAux1 = nullptr; - const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, volatileReduction, size, n, deviceAux1 ); + const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index fd1d781c7..93bb14e3e 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -288,7 +288,6 @@ reduce( const Index size, Result* deviceAux1( 0 ); Index reducedSize = reductionLauncher.start( reduction, - volatileReduction, dataFetcher, zero, deviceAux1 ); @@ -324,7 +323,7 @@ reduce( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU - auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); + auto result = reductionLauncher.finish( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -368,7 +367,6 @@ reduceWithArgument( const Index size, Index* deviceIndexes( nullptr ); Index reducedSize = reductionLauncher.startWithArgument( reduction, - volatileReduction, dataFetcher, zero, deviceAux1, @@ -409,7 +407,7 @@ reduceWithArgument( const Index size, } else { // data can't be safely reduced on host, so continue with the reduction on the GPU - auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); + auto result = reductionLauncher.finishWithArgument( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); -- 
GitLab From 13b89a716de4fe702aca8a37ac219cf60e556e65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 19:20:02 +0200 Subject: [PATCH 05/23] Removed volatile reduction completely --- .../BLAS/CommonVectorOperations.hpp | 51 ++++++------------ .../Algorithms/ArrayOperationsCuda.hpp | 9 ++-- .../Containers/Algorithms/Multireduction.h | 6 --- .../Containers/Algorithms/Multireduction.hpp | 6 +-- src/TNL/Containers/Algorithms/Reduction.h | 8 --- src/TNL/Containers/Algorithms/Reduction.hpp | 12 +---- src/TNL/Containers/Expressions/Comparison.h | 45 ++++++---------- .../DistributedExpressionTemplates.h | 24 +++------ .../DistributedVerticalOperations.h | 20 +------ .../Expressions/ExpressionTemplates.h | 24 +++------ .../Expressions/StaticExpressionTemplates.h | 12 ----- .../Expressions/VerticalOperations.h | 53 +++++-------------- src/TNL/Solvers/Linear/GMRES_impl.h | 4 -- src/TNL/Solvers/ODE/Euler.hpp | 3 +- src/TNL/Solvers/ODE/Merson_impl.h | 4 +- src/UnitTests/Containers/MultireductionTest.h | 2 - .../Containers/VectorEvaluateAndReduceTest.h | 9 ++-- 17 files changed, 70 insertions(+), 222 deletions(-) diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index 727a0abcf..b665de0ea 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -31,8 +31,7 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return 
Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -49,8 +48,7 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -67,8 +65,7 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -85,8 +82,7 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile 
ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -103,8 +99,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -121,8 +116,7 @@ getVectorL2Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -146,8 +140,7 @@ getVectorLpNorm( const Vector& v, const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - 
auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -167,8 +160,7 @@ getVectorSum( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -188,8 +180,7 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -209,8 +200,7 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) 
{ return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -230,8 +220,7 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -251,8 +240,7 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return 
Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -272,8 +260,7 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -296,8 +283,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, return diff * diff; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -324,8 +310,7 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, 
fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -345,8 +330,7 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -366,8 +350,7 @@ getScalarProduct( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 8623c99de..120ef3801 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -135,8 +135,7 @@ compare( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; auto 
reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } template< typename Element, @@ -153,8 +152,7 @@ containsValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, @@ -171,8 +169,7 @@ containsOnlyValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 00ca6078a..12ed22003 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -32,7 +32,6 @@ struct Multireduction< Devices::Host > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation - * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number 
of datasets to be reduced * result: output array of size = n @@ -40,13 +39,11 @@ struct Multireduction< Devices::Host > template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* result ); @@ -62,7 +59,6 @@ struct Multireduction< Devices::Cuda > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation - * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n @@ -70,13 +66,11 @@ struct Multireduction< Devices::Cuda > template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ); diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index e96e8e649..2f1999303 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -33,14 +33,12 @@ namespace Algorithms { template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > void Multireduction< Devices::Host >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* result ) @@ -173,14 +171,12 @@ reduce( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, - typename VolatileReduction, typename Index > void Multireduction< 
Devices::Cuda >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ) @@ -218,7 +214,7 @@ reduce( const Result zero, // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; - Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, volatileReduction, reducedSize, n, hostResult ); + Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index 6e5f9c12a..d4406332b 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -30,24 +30,20 @@ struct Reduction< Devices::Host > template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -58,24 +54,20 @@ struct Reduction< Devices::Cuda > template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename 
ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 93bb14e3e..28ddfaf26 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -41,13 +41,11 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25 template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -136,13 +134,11 @@ reduce( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Host >:: reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -261,13 +257,11 @@ reduceWithArgument( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -313,7 +307,7 @@ reduce( const Index size, // finish the reduction on the host auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; - const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); + 
const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -339,13 +333,11 @@ reduce( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -395,7 +387,7 @@ reduceWithArgument( const Index size, // finish the reduction on the host // auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; -// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); +// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index ff533e781..e65ec4958 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -66,8 +66,7 @@ struct VectorComparison< T1, T2, false > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -98,8 +97,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] 
__cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -113,8 +111,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -128,8 +125,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -143,8 +139,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= 
b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -161,8 +156,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -177,8 +171,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -188,8 +181,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true 
); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -199,8 +191,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -210,8 +201,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } }; @@ -228,8 +218,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -244,8 +233,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, 
ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -255,8 +243,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -266,8 +253,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -277,8 +263,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= 
b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 405b0014a..70eb37cc8 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ -2187,12 +2187,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2201,19 +2199,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2222,7 +2218,7 @@ Result evaluateAndReduce( Vector& lhs, 
RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2232,12 +2228,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2250,19 +2244,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2275,7 +2267,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2285,12 +2277,10 @@ template< typename Vector, typename T2, template< typename, typename > 
class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2303,19 +2293,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2328,7 +2316,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h index 4e569a45d..43940417b 100644 --- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h +++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h @@ -70,15 +70,7 @@ auto DistributedExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, 
volatile RealType& a, volatile RealType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::max() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::max() ); result.first = gatheredResults[ result.first ].first; } return result; @@ -135,15 +127,7 @@ auto DistributedExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile RealType& a, volatile RealType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::lowest() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::lowest() ); result.first = gatheredResults[ result.first ].first; } return result; diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h index 5a069b0b9..8816f153b 100644 --- a/src/TNL/Containers/Expressions/ExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h @@ -2110,12 +2110,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename 
Vector::RealType; @@ -2124,19 +2122,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2145,7 +2141,7 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2155,12 +2151,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2173,19 +2167,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return 
Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2198,7 +2190,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2208,12 +2200,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2226,19 +2216,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { using RealType = typename 
Vector::RealType; @@ -2251,7 +2239,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h index cce425923..0a3e8a7fb 100644 --- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h @@ -2276,13 +2276,11 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2295,13 +2293,11 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2317,13 +2313,11 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2339,13 
+2333,11 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2364,13 +2356,11 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); @@ -2386,13 +2376,11 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, Reduction& reduction, - VolatileReduction& volatileReduction, const Result& zero ) { Result result( zero ); diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index a12505780..2470a7f22 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -33,8 +33,7 @@ auto ExpressionMin( const Expression& expression ) -> std::decay_t< decltype( ex auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a < b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a < b ? 
a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -53,15 +52,7 @@ auto ExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -72,8 +63,7 @@ auto ExpressionMax( const Expression& expression ) -> std::decay_t< decltype( ex auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a > b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a > b ? 
a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -92,15 +82,7 @@ auto ExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -111,8 +93,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -123,8 +104,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( auto fetch = [=] 
__cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -135,8 +115,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ] * expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression, typename Real > @@ -147,8 +126,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), 
reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -159,8 +137,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a *= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a *= b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 1 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 1 ); } template< typename Expression > @@ -171,8 +148,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a && b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a && b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -183,8 +159,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a || b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a || b; }; - return Algorithms::Reduction< typename Expression::DeviceType 
>::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } template< typename Expression > @@ -195,8 +170,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a & b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a & b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -207,8 +181,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a | b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a | b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } } // namespace Expressions diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index d1ef7bc97..725ba3597 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -440,12 +440,10 @@ hauseholder_generate( const int i, const IndexType 
ldSize = this->ldSize; auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, reduction, - volatileReduction, size, i, aux ); @@ -552,12 +550,10 @@ hauseholder_cwy_transposed( VectorViewType z, const IndexType ldSize = this->ldSize; auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, reduction, - volatileReduction, size, i + 1, aux ); diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index ccc98daa7..3506152ff 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -120,8 +120,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) } } auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; - this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** * When time is close to stopTime the new residue diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index ce21298b0..55dbbdf4e 100644 --- 
a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -174,15 +174,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) time += currentTau; auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), - reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); ///// // When time is close to stopTime the new residue // may be inaccurate significantly. if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); - if( ! this->nextIteration() ) return false; diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index 29d2800f3..a876af2c3 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -115,12 +115,10 @@ void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVect return _V[ i + k * size ] * _y[ i ]; }; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, reduction, - volatileReduction, size, n, result.getData() ); diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index d8c26b93c..147f5946f 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -38,8 +38,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) using RealType = typename VectorView::RealType; auto reduction 
= [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return evaluateAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return evaluateAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } TYPED_TEST( VectorTest, evaluateAndReduce ) @@ -75,8 +74,7 @@ performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w ) using RealType = typename VectorView::RealType; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } template< typename VectorView > @@ -86,8 +84,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) using RealType = typename VectorView::RealType; auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, 5.0 * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( w, 5.0 * v, reduction, ( RealType ) 0.0 ); } -- GitLab From 1777e488d4e00559b3b1a68a66b2ffd204f66f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Sun, 11 Aug 2019 20:50:34 +0200 Subject: [PATCH 06/23] Removed ReductionOperations.h --- .../Algorithms/ArrayOperationsCuda.hpp | 1 - .../Algorithms/ArrayOperationsHost.hpp | 1 - .../Algorithms/ArrayOperationsMIC.hpp | 1 - .../Algorithms/CudaPrefixSumKernel.h | 1 - src/TNL/Containers/Algorithms/PrefixSum.hpp | 1 - .../Algorithms/ReductionOperations.h | 641 ------------------ src/TNL/Containers/DistributedVector.hpp | 1 - src/TNL/Containers/DistributedVectorView.hpp | 1 - 8 files changed, 648 deletions(-) 
delete mode 100644 src/TNL/Containers/Algorithms/ReductionOperations.h diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 120ef3801..3322a910e 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp index f7c164d06..3a54a862f 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp @@ -17,7 +17,6 @@ #include #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp index 5bf04dbaf..4113bbcd9 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp @@ -18,7 +18,6 @@ #include #include #include -#include #include namespace TNL { diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 40a662b08..0bfbe80ba 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -15,7 +15,6 @@ #include #include #include -#include #include namespace TNL { diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index d3c3b3071..1ffd4cf4e 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include diff --git a/src/TNL/Containers/Algorithms/ReductionOperations.h b/src/TNL/Containers/Algorithms/ReductionOperations.h deleted file mode 100644 index 33ef84b1c..000000000 --- 
a/src/TNL/Containers/Algorithms/ReductionOperations.h +++ /dev/null @@ -1,641 +0,0 @@ -/*************************************************************************** - ReductionOperations.h - description - ------------------- - begin : Mar 22, 2013 - copyright : (C) 2013 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include // std::numeric_limits - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/* - * Unary operations: reduction on one input vector. - */ - -template< typename Data, typename Result = Data > -class ParallelReductionSum -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const ResultType& data ) - { - result += data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const ResultType& data ) - { - result += data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMin -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] ); - } - - __cuda_callable__ 
void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::min( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::min( result, data ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMax -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] ); - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::max( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::max( result, data ); - } -}; - -template< typename Data, typename Result = bool > -class ParallelReductionLogicalAnd -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result && data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = result && data; - } -}; - - -template< typename Data, typename Result = bool > -class 
ParallelReductionLogicalOr -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result || data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = result || data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsSum : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsMin : public ParallelReductionMin< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class 
ParallelReductionAbsMax : public ParallelReductionMax< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionL2Norm : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const Data& aux = data1[ index ]; - result += aux * aux; - } -}; - - -template< typename Data, typename Result = Data, typename PType = Data > -class ParallelReductionLpNorm : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] ), p ); - } - -protected: - PType p; -}; - - -/* - * Binary operations: reduction on two input vectors. 
- */ - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionEqualities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionInequalities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] != data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionScalarProduct : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] * data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffSum : public ParallelReductionSum< Result, 
Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] - data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsSum : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = 
Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffL2Norm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static 
constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const ResultType aux = data2[ index ] - data1[ index ]; - result += aux * aux; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1, typename PType = Data1 > -class ParallelReductionDiffLpNorm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] - data2[ index ] ), p ); - } - -protected: - PType p; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsValue : public ParallelReductionLogicalOr< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsOnlyValue : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - template< typename 
Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index 73781c906..f50624c61 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -13,7 +13,6 @@ #pragma once #include "DistributedVector.h" -#include #include namespace TNL { diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 38715161c..2f1e4666e 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -13,7 +13,6 @@ #pragma once #include "DistributedVectorView.h" -#include #include namespace TNL { -- GitLab From d0fc1bb7c6bd3d4fc9a6eeb34685b9aedec30414 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 09:07:35 +0200 Subject: [PATCH 07/23] Removed VectorOperations class which is now useless It contained only methods for prefixSum and segmentedPrefixSum, which were identical for Host and Cuda, so they can be easily implemented directly in Vector and VectorView. 
--- src/Benchmarks/BLAS/CommonVectorOperations.h | 2 - .../BLAS/CommonVectorOperations.hpp | 1 - src/TNL/Containers/Algorithms/PrefixSum.h | 11 ++-- src/TNL/Containers/Algorithms/PrefixSumType.h | 24 ------- .../Containers/Algorithms/VectorAssignment.h | 1 - .../Containers/Algorithms/VectorOperations.h | 64 ------------------- .../Algorithms/VectorOperationsCuda_impl.h | 58 ----------------- .../Algorithms/VectorOperationsHost_impl.h | 55 ---------------- src/TNL/Containers/Vector.hpp | 10 ++- src/TNL/Containers/VectorView.h | 2 +- src/TNL/Containers/VectorView.hpp | 12 ++-- .../Containers/VectorBinaryOperationsTest.h | 1 - .../Containers/VectorEvaluateAndReduceTest.h | 1 - .../Containers/VectorPrefixSumTest.h | 30 ++++----- src/UnitTests/Containers/VectorTestSetup.h | 4 -- .../Containers/VectorUnaryOperationsTest.h | 1 - .../Containers/VectorVerticalOperationsTest.h | 1 - 17 files changed, 38 insertions(+), 240 deletions(-) delete mode 100644 src/TNL/Containers/Algorithms/PrefixSumType.h delete mode 100644 src/TNL/Containers/Algorithms/VectorOperations.h delete mode 100644 src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h delete mode 100644 src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.h b/src/Benchmarks/BLAS/CommonVectorOperations.h index 504da8fa6..ed2915368 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.h +++ b/src/Benchmarks/BLAS/CommonVectorOperations.h @@ -10,8 +10,6 @@ #pragma once -#include - namespace TNL { namespace Benchmarks { diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index b665de0ea..033e99301 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -11,7 +11,6 @@ #pragma once #include -#include #include "CommonVectorOperations.h" namespace TNL { diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h 
index 715d6f1b9..53d6d7f83 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -14,20 +14,23 @@ #include #include -#include -#include namespace TNL { namespace Containers { namespace Algorithms { +enum class PrefixSumType { + Exclusive, + Inclusive +}; + template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class PrefixSum {}; +class PrefixSum; template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class SegmentedPrefixSum {}; +class SegmentedPrefixSum; template< PrefixSumType Type > diff --git a/src/TNL/Containers/Algorithms/PrefixSumType.h b/src/TNL/Containers/Algorithms/PrefixSumType.h deleted file mode 100644 index ba03adfe6..000000000 --- a/src/TNL/Containers/Algorithms/PrefixSumType.h +++ /dev/null @@ -1,24 +0,0 @@ -/*************************************************************************** - PrefixSumType.h - description - ------------------- - begin : Jun 6, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -namespace TNL { -namespace Containers { -namespace Algorithms { - -enum class PrefixSumType { - Exclusive, - Inclusive -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorAssignment.h b/src/TNL/Containers/Algorithms/VectorAssignment.h index 36829be82..c861579f4 100644 --- a/src/TNL/Containers/Algorithms/VectorAssignment.h +++ b/src/TNL/Containers/Algorithms/VectorAssignment.h @@ -12,7 +12,6 @@ #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h deleted file mode 100644 index a12b310e3..000000000 --- a/src/TNL/Containers/Algorithms/VectorOperations.h +++ /dev/null @@ -1,64 +0,0 @@ 
-/*************************************************************************** - VectorOperations.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< typename Device > -class VectorOperations{}; - -template<> -class VectorOperations< Devices::Host > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -template<> -class VectorOperations< Devices::Cuda > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL - -#include -#include diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h deleted file mode 100644 index 4b53dbcf7..000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +++ /dev/null @@ -1,58 +0,0 @@ -/*************************************************************************** - VectorOperationsCuda_impl.h - description - 
------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, - typename Vector > -void -VectorOperations< Devices::Cuda >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Cuda, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Cuda >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Cuda, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h deleted file mode 100644 index 50d591b9f..000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ /dev/null @@ -1,55 +0,0 @@ 
-/*************************************************************************** - VectorOperationsHost_impl.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, typename Vector > -void -VectorOperations< Devices::Host >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Host, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Host >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Host, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp index 7338c8317..525ba6380 100644 --- a/src/TNL/Containers/Vector.hpp +++ 
b/src/TNL/Containers/Vector.hpp @@ -175,7 +175,10 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, @@ -190,7 +193,10 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h index 1662afb7c..0d0395454 100644 --- a/src/TNL/Containers/VectorView.h +++ b/src/TNL/Containers/VectorView.h @@ -14,7 +14,7 @@ #include #include -#include +#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp index 986d2bc0f..057d402d7 100644 --- a/src/TNL/Containers/VectorView.hpp +++ b/src/TNL/Containers/VectorView.hpp @@ -11,8 +11,6 @@ #pragma once #include -#include -#include #include #include @@ -127,7 +125,10 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const 
RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, @@ -141,7 +142,10 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + + auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); } template< typename Real, diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index 8fae76884..93283483c 100644 --- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace binary_tests { diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index 147f5946f..7c68c86f7 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -22,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // should be small enough to have fast tests, but larger than minGPUReductionDataSize diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 733013933..aba569df7 100644 --- 
a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -20,7 +20,6 @@ constexpr int VECTOR_TEST_SIZE = 10000; TYPED_TEST( VectorTest, prefixSum ) { using VectorType = typename TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; @@ -79,7 +78,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -87,7 +86,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); @@ -95,7 +94,7 @@ TYPED_TEST( VectorTest, prefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i + 1 ); @@ -103,7 +102,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, 
IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -111,11 +110,11 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } @@ -123,7 +122,6 @@ TYPED_TEST( VectorTest, prefixSum ) TYPED_TEST( VectorTest, exclusivePrefixSum ) { using VectorType = typename TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; @@ -186,12 +184,12 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( 
Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -199,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -207,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); @@ -215,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -223,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType 
>::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -231,11 +229,11 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } diff --git a/src/UnitTests/Containers/VectorTestSetup.h b/src/UnitTests/Containers/VectorTestSetup.h index 7338f9637..5c342dced 100644 --- a/src/UnitTests/Containers/VectorTestSetup.h +++ b/src/UnitTests/Containers/VectorTestSetup.h @@ -8,8 +8,6 @@ /* See Copyright Notice in tnl/Copyright */ -// NOTE: Vector = Array + VectorOperations, so we test Vector and VectorOperations at the same time - #pragma once #ifdef HAVE_GTEST @@ -24,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // test fixture for typed tests @@ -33,7 +30,6 @@ class VectorTest : public ::testing::Test { protected: using VectorType = Vector; - using VectorOperations = Algorithms::VectorOperations< typename VectorType::DeviceType >; using ViewType = VectorView< typename Vector::RealType, typename Vector::DeviceType, typename Vector::IndexType >; }; diff --git 
a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index a0aa2695f..122404253 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace unary_tests { diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index 758ee1d50..93ff286ae 100644 --- a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace vertical_tests { -- GitLab From e20a09309a4a02fc387e4ae8b80b435aa4e79935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 12:48:06 +0200 Subject: [PATCH 08/23] Changed reduction operation to use functions with `return a + b` instead of `a += b` This is nicer because it more clearly separates data load, computation and data store. Furthermore, it allows to use instances of std::plus, std::logical_and, std::logical_or, etc. instead of custom lambda functions. 
--- .../BLAS/CommonVectorOperations.hpp | 56 +++++++++---------- .../Algorithms/ArrayOperationsCuda.hpp | 12 ++-- .../Algorithms/CudaMultireductionKernel.h | 34 +++++------ .../Algorithms/CudaReductionKernel.h | 34 +++++------ .../Containers/Algorithms/Multireduction.hpp | 38 ++++++------- src/TNL/Containers/Algorithms/Reduction.hpp | 37 ++++++------ src/TNL/Containers/Expressions/Comparison.h | 30 +++++----- .../DistributedExpressionTemplates.h | 12 ++-- .../Expressions/ExpressionTemplates.h | 12 ++-- .../Expressions/StaticExpressionTemplates.h | 24 ++++---- .../Expressions/VerticalOperations.h | 26 ++++----- src/TNL/Solvers/Linear/GMRES_impl.h | 4 +- src/TNL/Solvers/ODE/Euler.hpp | 2 +- src/TNL/Solvers/ODE/Merson_impl.h | 2 +- src/UnitTests/Containers/MultireductionTest.h | 2 +- .../Containers/VectorEvaluateAndReduceTest.h | 6 +- 16 files changed, 165 insertions(+), 166 deletions(-) diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index 033e99301..cc6d0baad 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -29,7 +29,7 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -46,7 +46,7 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto reduction = [] __cuda_callable__ ( 
const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -63,7 +63,7 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -80,7 +80,7 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -97,7 +97,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -113,8 +113,8 @@ getVectorL2Norm( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; - auto 
reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } @@ -137,8 +137,8 @@ getVectorLpNorm( const Vector& v, return getVectorL2Norm< Vector, ResultType >( v ); const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } @@ -157,8 +157,8 @@ getVectorSum( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -177,8 +177,8 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const 
ResultType& b ) { a = TNL::max( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -197,8 +197,8 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -217,8 +217,8 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -237,8 +237,8 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ 
i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -257,8 +257,8 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -281,7 +281,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, auto diff = data1[ i ] - data2[ i ]; return diff * diff; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } @@ -307,8 +307,8 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( 
TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } @@ -327,8 +327,8 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } @@ -347,8 +347,8 @@ getScalarProduct( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 3322a910e..4a75c6fff 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -132,8 +132,8 @@ compare( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." 
); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } @@ -149,8 +149,8 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, (Index) 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a || b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } @@ -166,8 +166,8 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index 782e66eaa..e67c11b41 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -67,19 +67,19 @@ CudaMultireductionKernel( const Result zero, // Start with the sequential reduction and push the result into the shared memory. while( gid + 4 * gridSizeX < size ) { - reduction( sdata[ tid ], dataFetcher( gid, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); gid += 4 * gridSizeX; } while( gid + 2 * gridSizeX < size ) { - reduction( sdata[ tid ], dataFetcher( gid, y ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); gid += 2 * gridSizeX; } while( gid < size ) { - reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); gid += gridSizeX; } 
__syncthreads(); @@ -87,48 +87,48 @@ CudaMultireductionKernel( const Result zero, // Perform the parallel reduction. if( blockSizeX >= 1024 ) { if( threadIdx.x < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSizeX >= 512 ) { if( threadIdx.x < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSizeX >= 256 ) { if( threadIdx.x < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSizeX >= 128 ) { if( threadIdx.x < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). if( threadIdx.x < 32 ) { if( blockSizeX >= 64 ) - reduction( sdata[ tid ], sdata[ tid + 32 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. 
if( blockSizeX >= 32 ) - reduction( sdata[ tid ], sdata[ tid + 16 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSizeX >= 16 ) - reduction( sdata[ tid ], sdata[ tid + 8 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSizeX >= 8 ) - reduction( sdata[ tid ], sdata[ tid + 4 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSizeX >= 4 ) - reduction( sdata[ tid ], sdata[ tid + 2 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSizeX >= 2 ) - reduction( sdata[ tid ], sdata[ tid + 1 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 03da1e556..ce278c7f1 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -63,19 +63,19 @@ CudaReductionKernel( const Result zero, // Start with the sequential reduction and push the result into the shared memory. 
while( gid + 4 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -83,48 +83,48 @@ CudaReductionKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { if( blockSize >= 64 ) - reduction( sdata[ tid ], sdata[ tid + 32 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have // 2 * blockSize.x elements of shared memory per block, so we do not // access out of bounds. The results for the upper half will be undefined, // but unused anyway. if( blockSize >= 32 ) - reduction( sdata[ tid ], sdata[ tid + 16 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); __syncwarp(); if( blockSize >= 16 ) - reduction( sdata[ tid ], sdata[ tid + 8 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); __syncwarp(); if( blockSize >= 8 ) - reduction( sdata[ tid ], sdata[ tid + 4 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); __syncwarp(); if( blockSize >= 4 ) - reduction( sdata[ tid ], sdata[ tid + 2 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); __syncwarp(); if( blockSize >= 2 ) - reduction( sdata[ tid ], sdata[ tid + 1 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index 2f1999303..4044884cb 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -72,10 +72,10 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( int i = 0; i < block_size; i += 4 ) { - reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); - reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); - reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); - reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); } } } @@ -86,23 +86,23 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( Index i = blocks * block_size; i < size; i++ ) - reduction( _r[ 0 ], dataFetcher( i, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); } } // local reduction of unrolled results for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; - reduction( _r[ 0 ], _r[ 1 ] ); - reduction( _r[ 0 ], _r[ 2 ] ); - reduction( _r[ 0 ], _r[ 3 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); } // inter-thread reduction of local results #pragma omp critical { for( int k = 0; k < n; k++ ) - reduction( result[ k ], r[ 4 * k ] ); + result[ k ] = reduction( result[ k ], r[ 4 * k ] ); } } else { @@ -120,10 +120,10 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( int i = 0; i < block_size; i += 4 ) { - reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); - reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); - reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) 
); - reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); } } } @@ -132,15 +132,15 @@ reduce( const Result zero, for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; for( Index i = blocks * block_size; i < size; i++ ) - reduction( _r[ 0 ], dataFetcher( i, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); } // reduction of unrolled results for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; - reduction( _r[ 0 ], _r[ 1 ] ); - reduction( _r[ 0 ], _r[ 2 ] ); - reduction( _r[ 0 ], _r[ 3 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); // copy the result into the output parameter result[ k ] = _r[ 0 ]; @@ -154,13 +154,13 @@ reduce( const Result zero, const Index offset = b * block_size; for( int k = 0; k < n; k++ ) { for( int i = 0; i < block_size; i++ ) - reduction( result[ k ], dataFetcher( offset + i, k ) ); + result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) ); } } for( int k = 0; k < n; k++ ) { for( Index i = blocks * block_size; i < size; i++ ) - reduction( result[ k ], dataFetcher( i, k ) ); + result[ k ] = reduction( result[ k ], dataFetcher( i, k ) ); } } #ifdef HAVE_OPENMP diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 28ddfaf26..19c84b9dd 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -65,10 +65,10 @@ reduce( const Index size, for( int b = 0; b < blocks; b++ ) { const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( 
r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } @@ -76,18 +76,18 @@ reduce( const Index size, #pragma omp single nowait { for( Index i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); } // local reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - reduction( result, r[ 0 ] ); + result = reduction( result, r[ 0 ] ); } } return result; @@ -102,28 +102,27 @@ reduce( const Index size, for( int b = 0; b < blocks; b++ ) { const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) for( Index i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); - //operation.dataFetcher( r[ 0 ], i, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = 
reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { Result result = zero; for( Index i = 0; i < size; i++ ) - reduction( result, dataFetcher( i ) ); + result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index e65ec4958..a84b17fec 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -65,7 +65,7 @@ struct VectorComparison< T1, T2, false > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -96,7 +96,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -110,7 +110,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -124,7 +124,7 @@ struct 
Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -138,7 +138,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; @@ -155,7 +155,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -170,7 +170,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -180,7 +180,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = 
typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -190,7 +190,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } @@ -200,7 +200,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); } }; @@ -217,7 +217,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -232,7 +232,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; 
}; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -242,7 +242,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -252,7 +252,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } @@ -262,7 +262,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 70eb37cc8..1d3077b43 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ 
-2190,7 +2190,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2209,7 +2209,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2231,7 +2231,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2254,7 +2254,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2280,7 +2280,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2303,7 +2303,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; diff --git 
a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h index 8816f153b..f01d8a68e 100644 --- a/src/TNL/Containers/Expressions/ExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h @@ -2113,7 +2113,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2132,7 +2132,7 @@ template< typename Vector, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2154,7 +2154,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2177,7 +2177,7 @@ template< typename Vector, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2203,7 +2203,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2226,7 +2226,7 @@ template< typename Vector, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation 
>& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h index 0a3e8a7fb..1bafe7cfc 100644 --- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h @@ -2280,12 +2280,12 @@ template< typename Vector, __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2297,12 +2297,12 @@ template< typename Vector, __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2317,14 +2317,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2337,14 +2337,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduce( Vector& lhs, const 
Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2360,14 +2360,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } @@ -2380,14 +2380,14 @@ template< typename Vector, __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index 2470a7f22..29f97c985 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -32,7 +32,7 @@ auto ExpressionMin( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a < b ? 
a : b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -44,7 +44,7 @@ auto ExpressionArgMin( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a > b ) { a = b; aIdx = bIdx; @@ -62,7 +62,7 @@ auto ExpressionMax( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a > b ? 
a : b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } @@ -74,7 +74,7 @@ auto ExpressionArgMax( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a < b ) { a = b; aIdx = bIdx; @@ -92,7 +92,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -103,7 +103,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -114,7 +114,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( 
using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ] * expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -125,7 +125,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -136,7 +136,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a *= b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a * b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 1 ); } @@ -147,7 +147,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a && b; }; + auto reduction = [] __cuda_callable__ ( 
const ResultType& a, const ResultType& b ) { return a && b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -158,7 +158,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a || b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a || b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } @@ -169,7 +169,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a & b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a & b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } @@ -180,7 +180,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a | b; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a | b; }; return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); } diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h 
b/src/TNL/Solvers/Linear/GMRES_impl.h index 725ba3597..cf87ea2b2 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -439,7 +439,7 @@ hauseholder_generate( const int i, const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); const IndexType ldSize = this->ldSize; auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, @@ -549,7 +549,7 @@ hauseholder_cwy_transposed( VectorViewType z, const RealType* _w = Traits::getConstLocalView( w ).getData(); const IndexType ldSize = this->ldSize; auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index 3506152ff..a4ff45fef 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -119,7 +119,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) continue; } } - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 55dbbdf4e..67fd5e101 
100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -173,7 +173,7 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType lastResidue = this->getResidue(); time += currentTau; - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index a876af2c3..aeda3eeff 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -114,7 +114,7 @@ void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVect TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); return _V[ i + k * size ] * _y[ i ]; }; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index 7c68c86f7..3ff2d932f 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -36,7 +36,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; return evaluateAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } @@ -72,7 +72,7 @@ performAddAndReduce1( 
VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; return addAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); } @@ -82,7 +82,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; + auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; return addAndReduce( w, 5.0 * v, reduction, ( RealType ) 0.0 ); } -- GitLab From 0a57393f55caafab2c8b271cf98536700c30b209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 13:58:21 +0200 Subject: [PATCH 09/23] Replaced custom lambda functions with instances of STL types where possible --- .../BLAS/CommonVectorOperations.hpp | 27 ++++------- .../Algorithms/ArrayOperationsCuda.hpp | 9 ++-- .../Containers/Algorithms/Multireduction.h | 6 +++ src/TNL/Containers/Algorithms/Reduction.h | 9 ++-- src/TNL/Containers/Algorithms/Reduction.hpp | 8 ++-- src/TNL/Containers/Expressions/Comparison.h | 45 +++++++------------ .../Expressions/VerticalOperations.h | 27 ++++------- src/TNL/Solvers/Linear/GMRES_impl.h | 6 +-- src/TNL/Solvers/ODE/Euler.hpp | 3 +- src/TNL/Solvers/ODE/Merson_impl.h | 3 +- src/UnitTests/Containers/MultireductionTest.h | 3 +- .../Containers/VectorEvaluateAndReduceTest.h | 9 ++-- 12 files changed, 59 insertions(+), 96 deletions(-) diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index cc6d0baad..640fda337 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -97,8 +97,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] 
__cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -114,8 +113,7 @@ getVectorL2Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -138,8 +136,7 @@ getVectorLpNorm( const Vector& v, const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -158,8 +155,7 @@ getVectorSum( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType 
) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -258,8 +254,7 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -281,8 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, auto diff = data1[ i ] - data2[ i ]; return diff * diff; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -308,8 +302,7 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -328,8 +321,7 @@ 
getVectorDifferenceSum( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -348,8 +340,7 @@ getScalarProduct( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 4a75c6fff..46618fcd7 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -133,8 +133,7 @@ compare( const Element1* destination, TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." 
); auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } template< typename Element, @@ -150,8 +149,7 @@ containsValue( const Element* data, TNL_ASSERT_GE( size, (Index) 0, "" ); auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a || b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); + return Reduction< Devices::Cuda >::reduce( size, std::logical_or<>{}, fetch, false ); } template< typename Element, @@ -167,8 +165,7 @@ containsOnlyValue( const Element* data, TNL_ASSERT_GE( size, 0, "" ); auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 12ed22003..9802a2953 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -12,6 +12,8 @@ #pragma once +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. + #include #include @@ -32,6 +34,8 @@ struct Multireduction< Devices::Host > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. 
* size: the size of each dataset * n: number of datasets to be reduced * result: output array of size = n @@ -59,6 +63,8 @@ struct Multireduction< Devices::Cuda > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index d4406332b..41b000221 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -13,6 +13,7 @@ #pragma once #include // std::pair +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. #include #include @@ -33,7 +34,7 @@ struct Reduction< Devices::Host > typename DataFetcher > static Result reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); @@ -43,7 +44,7 @@ struct Reduction< Devices::Host > typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -57,7 +58,7 @@ struct Reduction< Devices::Cuda > typename DataFetcher > static Result reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); @@ -67,7 +68,7 @@ struct Reduction< Devices::Cuda > typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ); }; diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp 
b/src/TNL/Containers/Algorithms/Reduction.hpp index 19c84b9dd..c33065d19 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -45,7 +45,7 @@ template< typename Index, Result Reduction< Devices::Host >:: reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -137,7 +137,7 @@ template< typename Index, std::pair< Index, Result > Reduction< Devices::Host >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -260,7 +260,7 @@ template< typename Index, Result Reduction< Devices::Cuda >:: reduce( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -336,7 +336,7 @@ template< typename Index, std::pair< Index, Result > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index a84b17fec..616ad5807 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -65,8 +65,7 @@ struct VectorComparison< T1, T2, false > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -96,8 +95,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename 
T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -110,8 +108,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -124,8 +121,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -138,8 +134,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), 
std::logical_and<>{}, fetch, true ); } }; @@ -155,8 +150,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -170,8 +164,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -180,8 +173,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -190,8 +182,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< 
DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -200,8 +191,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -217,8 +207,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -232,8 +221,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -242,8 +230,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ 
i ] >= b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -252,8 +239,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -262,8 +248,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; - auto reduction = [] __cuda_callable__ ( bool a, bool b ) { return a && b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index 29f97c985..29e904bbf 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -92,8 +92,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return 
Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -103,8 +102,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -114,8 +112,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ] * expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression, typename Real > @@ -125,8 +122,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a + b; }; - return 
Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -136,8 +132,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a * b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 1 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::multiplies<>{}, fetch, (ResultType) 1 ); } template< typename Expression > @@ -147,8 +142,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a && b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -158,8 +152,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a || b; }; - return 
Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_or<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -169,8 +162,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a & b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -180,8 +172,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return a | b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_or<>{}, fetch, (ResultType) 0 ); } } // namespace Expressions diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index cf87ea2b2..7f316d730 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -439,11 +439,10 @@ hauseholder_generate( const int i, const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); const IndexType ldSize = this->ldSize; 
auto fetch = [_Y, _y_i, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, - reduction, + std::plus<>{}, size, i, aux ); @@ -549,11 +548,10 @@ hauseholder_cwy_transposed( VectorViewType z, const RealType* _w = Traits::getConstLocalView( w ).getData(); const IndexType ldSize = this->ldSize; auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Containers::Algorithms::Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, - reduction, + std::plus<>{}, size, i + 1, aux ); diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index a4ff45fef..12da6439b 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -119,8 +119,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) continue; } } - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + this->setResidue( addAndReduceAbs( u, currentTau * k1, std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** * When time is close to stopTime the new residue diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index 67fd5e101..3c88576e9 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -173,9 +173,8 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType lastResidue = this->getResidue(); time += currentTau; - auto reduction = [] __cuda_callable__ ( const RealType& 
a, const RealType& b ) { return a + b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), - reduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); ///// // When time is close to stopTime the new residue diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index aeda3eeff..7a321f583 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -114,11 +114,10 @@ void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVect TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); return _V[ i + k * size ] * _y[ i ]; }; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; Multireduction< DeviceType >::reduce ( (RealType) 0, fetch, - reduction, + std::plus<>{}, size, n, result.getData() ); diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index 3ff2d932f..8c0cd7f90 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -36,8 +36,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - return evaluateAndReduce( w, u * v, reduction, ( RealType ) 0.0 ); + return evaluateAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } TYPED_TEST( VectorTest, evaluateAndReduce ) @@ -72,8 +71,7 @@ performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - return addAndReduce( w, u * v, reduction, ( 
RealType ) 0.0 ); + return addAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } template< typename VectorView > @@ -82,8 +80,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( const RealType& a, const RealType& b ) { return a + b; }; - return addAndReduce( w, 5.0 * v, reduction, ( RealType ) 0.0 ); + return addAndReduce( w, 5.0 * v, std::plus<>{}, ( RealType ) 0.0 ); } -- GitLab From 32c69a1143406a8e11613273e3014e57dc4362f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Mon, 12 Aug 2019 17:46:53 +0200 Subject: [PATCH 10/23] Ugly workaround for nvcc's stupid modification of `new` expressions --- .../tnl-benchmark-simple-heat-equation.h | 11 ++++- src/TNL/Containers/Algorithms/Reduction.hpp | 49 +++++++++++++++++-- 2 files changed, 54 insertions(+), 6 deletions(-) diff --git a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h index 33dff1ded..944e7f73f 100644 --- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h +++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h @@ -259,12 +259,21 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters, const Index dofsCount = gridXSize * gridYSize; dim3 cudaUpdateBlocks( dofsCount / 256 + ( dofsCount % 256 != 0 ) ); dim3 cudaUpdateBlockSize( 256 ); - + /**** * Initiation */ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out some warnings... 
+ #ifdef __NVCC__ + Real* u = new Real[ static_cast(dofsCount) ]; + Real* aux = new Real[ static_cast(dofsCount) ]; + #else Real* u = new Real[ dofsCount ]; Real* aux = new Real[ dofsCount ]; + #endif Real* max_du = new Real[ cudaUpdateBlocks.x ]; if( ! u || ! aux ) { diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index c33065d19..20265c6b0 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -279,7 +279,7 @@ reduce( const Index size, // start the reduction on the GPU Result* deviceAux1( 0 ); - Index reducedSize = reductionLauncher.start( + const int reducedSize = reductionLauncher.start( reduction, dataFetcher, zero, @@ -294,7 +294,20 @@ reduce( const Index size, if( can_reduce_later_on_host ) { // transfer the reduced data from device to host - std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. 
+ #ifdef __NVCC__ + new Result[ static_cast(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -356,7 +369,7 @@ reduceWithArgument( const Index size, // start the reduction on the GPU Result* deviceAux1( nullptr ); Index* deviceIndexes( nullptr ); - Index reducedSize = reductionLauncher.startWithArgument( + const int reducedSize = reductionLauncher.startWithArgument( reduction, dataFetcher, zero, @@ -372,8 +385,34 @@ reduceWithArgument( const Index size, if( can_reduce_later_on_host ) { // transfer the reduced data from device to host - std::unique_ptr< Result[] > resultArray{ new Result[ reducedSize ] }; - std::unique_ptr< Index[] > indexArray{ new Index[ reducedSize ] }; + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new Result[ static_cast(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; + std::unique_ptr< Index[] > indexArray{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. 
+ #ifdef __NVCC__ + new Index[ static_cast(reducedSize) ] + #else + new Index[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); ArrayOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); -- GitLab From 505d0b68c3d445db50c6b7d1e2cc20fd598d6a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 18:40:50 +0200 Subject: [PATCH 11/23] Optimized OpenMP thread counts for reduction and multireduction --- .../Containers/Algorithms/Multireduction.hpp | 88 ++++++++++--------- src/TNL/Containers/Algorithms/Reduction.hpp | 10 ++- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp index 4044884cb..f4407e620 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.hpp +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -50,59 +50,61 @@ reduce( const Result zero, const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) -#pragma omp parallel - { - // first thread initializes the result array - #pragma omp single nowait + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { - for( int k = 0; k < n; k++ ) - result[ k ] = zero; - } + // first thread initializes the result array + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + } - // initialize array for thread-local results - // (it is accessed as a row-major matrix with n rows and 4 columns) - Result r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = zero; + // initialize array for thread-local results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k 
] = zero; - #pragma omp for nowait - for( int b = 0; b < blocks; b++ ) { - const Index offset = b * block_size; - for( int k = 0; k < n; k++ ) { - Result* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); - _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); - _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); - _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + #pragma omp for nowait + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } } } - } - // the first thread that reaches here processes the last, incomplete block - #pragma omp single nowait - { + // the first thread that reaches here processes the last, incomplete block + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + } + + // local reduction of unrolled results for( int k = 0; k < n; k++ ) { Result* _r = r + 4 * k; - for( Index i = blocks * block_size; i < size; i++ ) - _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); } - } - // local reduction of unrolled results - for( int k = 0; k < n; k++ ) { - Result* _r = r + 4 * k; - _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); - _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); - _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); - } - - // inter-thread reduction of local results - #pragma omp 
critical - { - for( int k = 0; k < n; k++ ) - result[ k ] = reduction( result[ k ], r[ 4 * k ] ); + // inter-thread reduction of local results + #pragma omp critical + { + for( int k = 0; k < n; k++ ) + result[ k ] = reduction( result[ k ], r[ 4 * k ] ); + } } } else { diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 20265c6b0..229af1379 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -53,10 +53,11 @@ reduce( const Index size, const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable Result result = zero; -#pragma omp parallel + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results Result r[ 4 ] = { zero, zero, zero, zero }; @@ -145,10 +146,11 @@ reduceWithArgument( const Index size, const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable std::pair< Index, Result > result( -1, zero ); -#pragma omp parallel + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results Index arg[ 4 ] = { 0, 0, 0, 0 }; -- GitLab From 232be124a0f776ce48e663d1d474183f641e9549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 18:42:21 +0200 Subject: [PATCH 12/23] Benchmarks: added scalar multiplication with BLAS --- src/Benchmarks/BLAS/vector-operations.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 
25fa055bf..c7f1cf751 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -435,6 +435,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto multiplyCuda = [&]() { deviceVector *= 0.5; }; +#ifdef HAVE_BLAS + auto multiplyBlas = [&]() { + blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 ); + }; +#endif #ifdef HAVE_CUDA auto multiplyCublas = [&]() { const Real alpha = 0.5; @@ -445,6 +450,9 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas ); +#endif #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); -- GitLab From e6e6cf46bd017d806a06ca138935cbf0721b6afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 19:08:43 +0200 Subject: [PATCH 13/23] Removed timing parameter from benchmarks Benchmarks can be easily profiled even without this parameter, so it was just an unnecessary complication. 
--- src/Benchmarks/Benchmarks.h | 48 +++++++++------------------------- src/Benchmarks/FunctionTimer.h | 33 +++++++++-------------- 2 files changed, 25 insertions(+), 56 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 42b19814e..c4ae772a7 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -65,18 +65,17 @@ public: using Logging::MetadataMap; using Logging::MetadataColumns; using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >; - + Benchmark( int loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) {} - + static void configSetup( Config::ConfigDescription& config ) { config.addEntry< int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< bool >( "reset", "Call reset function between loops.", true ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); - config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } @@ -85,7 +84,6 @@ public: this->loops = parameters.getParameter< int >( "loops" ); this->reset = parameters.getParameter< bool >( "reset" ); this->minTime = parameters.getParameter< double >( "min-time" ); - this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< int >( "verbose" ); Logging::setVerbose( verbose ); } @@ -96,7 +94,7 @@ public: { this->loops = loops; } - + void setMinTime( const double& minTime ) { this->minTime = minTime; @@ -121,7 +119,6 @@ public: metadata["loops"] = convertToString(loops); metadata["reset"] = convertToString( reset ); metadata["minimal test time"] = convertToString( minTime ); - metadata["timing"] = convertToString( timing ); writeMetadata( metadata ); } @@ -208,28 +205,16 @@ public: if( verbose > 1 ) { // run the monitor main loop 
Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } @@ -248,7 +233,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ResetFunction, typename ComputeFunction, typename... NextComputations > @@ -277,16 +262,10 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - result.time = functionTimer. 
template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -304,7 +283,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ComputeFunction, typename... NextComputations > inline double @@ -345,7 +324,6 @@ protected: double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; - bool timing = true; bool reset = true; SolverMonitorType monitor; }; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index a76ebd558..1fad1e946 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -28,8 +28,7 @@ class FunctionTimer public: using DeviceType = Device; - template< bool timing, - typename ComputeFunction, + template< typename ComputeFunction, typename ResetFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > double @@ -61,26 +60,24 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.start(); + timer.start(); for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) + loops < maxLoops || timer.getRealTime() < minTime; + loops++ ) compute(); // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.stop(); + 
timer.stop(); } else { for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) + loops < maxLoops || timer.getRealTime() < minTime; + loops++ ) { // abuse the monitor's "time" for loops monitor.setTime( loops + 1 ); @@ -91,25 +88,19 @@ class FunctionTimer if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.start(); + timer.start(); compute(); #ifdef HAVE_CUDA if( std::is_same< Device, Devices::Cuda >::value ) cudaDeviceSynchronize(); #endif - if( timing ) - timer.stop(); + timer.stop(); } } - if( timing ) - return timer.getRealTime() / ( double ) loops; - else - return std::numeric_limits::quiet_NaN(); + return timer.getRealTime() / ( double ) loops; } - template< bool timing, - typename ComputeFunction, + template< typename ComputeFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > double timeFunction( ComputeFunction compute, @@ -119,7 +110,7 @@ class FunctionTimer Monitor && monitor = Monitor() ) { auto noReset = [] () {}; - return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } int getPerformedLoops() const -- GitLab From 2bea9311ab9ed1211fb2dcaf6478312c79c9a7ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 19:53:58 +0200 Subject: [PATCH 14/23] Benchmarks: compute sample standard deviation of the measured computation times --- src/Benchmarks/Benchmarks.h | 23 ++- src/Benchmarks/FunctionTimer.h | 160 +++++++-------- src/Benchmarks/Logging.h | 365 ++++++++++++++++----------------- 3 files changed, 271 insertions(+), 277 deletions(-) diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index c4ae772a7..683a18376 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -17,7 +17,6 @@ #include "Logging.h" #include -#include #include 
#include @@ -35,24 +34,24 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; - struct BenchmarkResult { using HeaderElements = Logging::HeaderElements; using RowElements = Logging::RowElements; - double bandwidth = std::numeric_limits::quiet_NaN(); double time = std::numeric_limits::quiet_NaN(); + double stddev = std::numeric_limits::quiet_NaN(); + double bandwidth = std::numeric_limits::quiet_NaN(); double speedup = std::numeric_limits::quiet_NaN(); virtual HeaderElements getTableHeader() const { - return HeaderElements({"bandwidth", "time", "speedup"}); + return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" }); } virtual RowElements getRowElements() const { - return RowElements({ bandwidth, time, speedup }); + return RowElements({ time, stddev, stddev / time, bandwidth, speedup }); } }; @@ -200,21 +199,22 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->reset ) - result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { if( this->reset ) - result.time = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = 
functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } @@ -257,15 +257,16 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - result.time = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index 1fad1e946..1edd61204 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -17,6 +17,7 @@ #include #include +#include #include namespace TNL { @@ -25,101 +26,94 @@ namespace Benchmarks { template< typename Device > class FunctionTimer { - public: - using DeviceType = Device; - - template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - ResetFunction reset, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor(), - bool performReset = true ) +public: + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + 
ResetFunction reset, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + if( verbose > 1 ) + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + Containers::Vector< double > results( maxLoops ); + results.setValue( 0.0 ); + + for( loops = 0; + loops < maxLoops || sum( results ) < minTime; + loops++ ) { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - if( verbose > 1 ) - monitor.setTimer( timer ); - - // warm up + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); reset(); - compute(); - // If we do not perform reset function and don't need - // the monitor, the timer is not interrupted after each loop. - if( ! performReset && verbose < 2 ) - { - // Explicit synchronization of the CUDA device + // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.start(); - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - loops++ ) - compute(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.stop(); - } - else - { - for( loops = 0; - loops < maxLoops || timer.getRealTime() < minTime; - loops++ ) - { - // abuse the monitor's "time" for loops - monitor.setTime( loops + 1 ); - reset(); - - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - timer.start(); - compute(); + // reset timer before each computation + timer.reset(); + timer.start(); + compute(); #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda 
>::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.stop(); - } - } - return timer.getRealTime() / ( double ) loops; - } + timer.stop(); - template< typename ComputeFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - auto noReset = [] () {}; - return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + results[ loops ] = timer.getRealTime(); } - int getPerformedLoops() const - { - return this->loops; + const double mean = sum( results ) / (double) loops; + if( loops > 1 ) { + const double stddev = 1.0 / std::sqrt( loops - 1 ) * l2Norm( results - mean ); + return std::make_pair( mean, stddev ); } - - protected: - int loops; + else { + const double stddev = std::numeric_limits::quiet_NaN(); + return std::make_pair( mean, stddev ); + } + } + + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor ); + } + + int getPerformedLoops() const + { + return this->loops; + } + +protected: + int loops; }; } // namespace Benchmarks diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h index b10ab7199..61608d364 100644 --- a/src/Benchmarks/Logging.h +++ b/src/Benchmarks/Logging.h @@ -16,225 +16,224 @@ #include #include #include +#include #include #include +#include + namespace TNL { - namespace Benchmarks { 
+namespace Benchmarks { class Logging { - public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using HeaderElements = std::vector< String >; - using RowElements = std::vector< double >; - - Logging( int verbose = true ) - : verbose(verbose) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { +public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector; + + using HeaderElements = std::vector< String >; + using RowElements = std::vector< double >; + + Logging( int verbose = true ) + : verbose(verbose) + {} + + void + setVerbose( int verbose) + { + this->verbose = verbose; + } + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : 
metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; + std::cout << std::setw( 20 ) << it.first; } - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; - log << header_indent << " " << spanningElement << std::endl; for( auto & it : subElements ) { - log << header_indent << "! 
" << it << std::endl; + std::cout << std::setw( 15 ) << it; } + std::cout << std::endl; - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } + header_changed = false; } - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; } - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns + // dump stacked spanning columns + if( horizontalGroups.size() > 0 ) while( horizontalGroups.back().second <= 0 ) { horizontalGroups.pop_back(); header_indent.pop_back(); } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << 
std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; } + } - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; } - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); } + } - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; } - return false; + std::cout << std::endl; } - protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); + // only when changed (the header has been already adjusted) + // print each 
element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; } - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); - } // namespace Benchmarks -} // namespace TNL + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + +protected: + // manual double -> String conversion with 
fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + int verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; +} // namespace Benchmarks +} // namespace TNL -- GitLab From 7cc55deee1cb6f5610d152ed2fe7e9eec48033af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 22:18:47 +0200 Subject: [PATCH 15/23] Implemented parallel prefix-sum with OpenMP Fixes #42 --- src/TNL/Containers/Algorithms/PrefixSum.hpp | 71 +++++++++++++++++++-- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 1ffd4cf4e..59c160bf1 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -12,6 +12,8 @@ #pragma once +#include // std::unique_ptr + #include "PrefixSum.h" //#define CUDA_REDUCTION_PROFILING @@ -56,21 +58,76 @@ perform( Vector& v, using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - // TODO: parallelize with OpenMP - if( Type == PrefixSumType::Inclusive ) +#ifdef HAVE_OPENMP + const int threads = Devices::Host::getMaxThreadsCount(); + std::unique_ptr< RealType[] > block_sums{ + // Workaround for nvcc 10.1.168 - it would modifie the simple expression + // `new RealType[reducedSize]` in the source code to `new (RealType[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... 
+ // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new RealType[ static_cast(threads) + 1 ] + #else + new RealType[ threads + 1 ] + #endif + }; + block_sums[ 0 ] = zero; + + #pragma omp parallel + { + // init + const int thread_idx = omp_get_thread_num(); + RealType block_sum = zero; + + // perform prefix-sum on blocks statically assigned to threads + if( Type == PrefixSumType::Inclusive ) { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + reduction( block_sum, v[ i ] ); + v[ i ] = block_sum; + } + } + else { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; + v[ i ] = block_sum; + reduction( block_sum, x ); + } + } + + // write the block sums into the buffer + block_sums[ thread_idx + 1 ] = block_sum; + #pragma omp barrier + + // calculate per-block offsets + RealType offset = 0; + for( int i = 0; i < thread_idx + 1; i++ ) + reduction( offset, block_sums[ i ] ); + + // shift intermediate results by the offset + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) + reduction( v[ i ], offset ); + } +#else + if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) reduction( v[ i ], v[ i - 1 ] ); + } else // Exclusive prefix sum { - RealType aux( v[ begin ] ); - v[ begin ] = zero; - for( IndexType i = begin + 1; i < end; i++ ) - { - RealType x = v[ i ]; + RealType aux = zero; + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; v[ i ] = aux; reduction( aux, x ); } } +#endif } //// -- GitLab From 27631930833ddf68032b41afb2aafc06fdaa66e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 22:19:11 +0200 Subject: [PATCH 16/23] Added prefix-sum to BLAS benchmarks --- src/Benchmarks/BLAS/vector-operations.h | 62 
+++++++++++-------------- 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index c7f1cf751..80c63020d 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -346,7 +346,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normCudaET = [&]() { resultDevice = lpNorm( deviceView, 3.0 ); }; - benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", l3normHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET ); @@ -369,7 +368,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductCudaET = [&]() { resultDevice = ( deviceView, deviceView2 ); }; - #ifdef HAVE_BLAS auto scalarProductBlas = [&]() { resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 ); @@ -395,38 +393,6 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif - //// - // Prefix sum - /* - std::cout << "Benchmarking prefix-sum:" << std::endl; - timer.reset(); - timer.start(); - hostVector.computePrefixSum(); - timer.stop(); - timeHost = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - - timer.reset(); - timer.start(); - deviceVector.computePrefixSum(); - timer.stop(); - timeDevice = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." 
<< std::endl; - std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; - - HostVector auxHostVector; - auxHostVector.setLike( deviceVector ); - auxHostVector = deviceVector; - for( int i = 0; i < size; i++ ) - if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) - { - std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; - } - */ - - //// // Scalar multiplication auto multiplyHost = [&]() { @@ -614,6 +580,34 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas ); #endif + //// + // Inclusive prefix sum + auto inclusivePrefixSumHost = [&]() { + hostVector.prefixSum(); + }; + auto inclusivePrefixSumCuda = [&]() { + deviceVector.prefixSum(); + }; + benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda ); +#endif + + //// + // Exclusive prefix sum + auto exclusivePrefixSumHost = [&]() { + hostVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + auto exclusivePrefixSumCuda = [&]() { + deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda ); +#endif + #ifdef HAVE_CUDA cublasDestroy( cublasHandle ); #endif -- GitLab From 8d0d2638648279850cd085dce774661fb0c307bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Wed, 14 Aug 2019 23:09:47 +0200 Subject: [PATCH 17/23] Removed volatile reduction from PrefixSum and updated the normal reduction operation Same changes 
as for the regular Reduction operation... --- .../Algorithms/CudaPrefixSumKernel.h | 81 +++++++++---------- src/TNL/Containers/Algorithms/PrefixSum.h | 26 ++---- src/TNL/Containers/Algorithms/PrefixSum.hpp | 42 ++++------ src/TNL/Containers/Vector.hpp | 10 +-- src/TNL/Containers/VectorView.hpp | 10 +-- .../Containers/VectorPrefixSumTest.h | 12 +-- 6 files changed, 71 insertions(+), 110 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 0bfbe80ba..38eeb6d8d 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -24,13 +24,11 @@ namespace Algorithms { #ifdef HAVE_CUDA template< typename Real, - typename Operation, - typename VolatileOperation, + typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, - Operation operation, - VolatileOperation volatileOperation, + Reduction reduction, const Real zero, const Index size, const Index elementsInBlock, @@ -40,8 +38,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Real gridShift ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); - volatile Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; - volatile Real* warpSums = &auxData[ blockDim.x ]; + Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; + Real* warpSums = &auxData[ blockDim.x ]; const Index lastElementIdx = size - blockIdx.x * elementsInBlock; const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock ); @@ -70,7 +68,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } } if( blockIdx.x == 0 && threadIdx.x == 0 ) - operation( sharedData[ 0 ], gridShift ); + sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** * Perform the 
sequential prefix-sum. @@ -90,10 +88,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { - operation( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx.x ] = - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; chunkPointer++; } @@ -102,9 +101,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, */ const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize(); const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize(); - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks ) - volatileOperation( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + __syncwarp(); + } if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx.x ]; @@ -114,21 +115,23 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) - volatileOperation( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + 
warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + __syncwarp(); + } __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) - volatileOperation( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + __syncthreads(); /*** * Store the result back in global memory. */ - __syncthreads(); idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { @@ -136,7 +139,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; - operation( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); + sharedData[ Devices::Cuda::getInterleaving( idx ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ]; idx += blockDim.x; } @@ -147,8 +151,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, if( prefixSumType == PrefixSumType::Exclusive ) { Real aux = zero; - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); + aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); + aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); auxArray[ blockIdx.x ] = aux; } else @@ -157,10 +161,10 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } template< typename Real, - typename Operation, + typename Reduction, typename Index > __global__ void -cudaSecondPhaseBlockPrefixSum( Operation operation, +cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, const Index elementsInBlock, Real gridShift, @@ -174,7 +178,7 @@ cudaSecondPhaseBlockPrefixSum( Operation 
operation, Index readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) { - operation( data[ readIdx + readOffset ], shift ); + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; } } @@ -185,12 +189,10 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { - template< typename Operation, - typename VolatileOperation > + template< typename Reduction > static void cudaRecursivePrefixSum( PrefixSumType prefixSumType_, - Operation& operation, - VolatileOperation& volatileOperation, + Reduction& reduction, const Real& zero, const Index size, const Index blockSize, @@ -221,8 +223,7 @@ struct CudaPrefixSumKernelLauncher const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, - operation, - volatileOperation, + reduction, zero, size, elementsInBlock, @@ -242,8 +243,7 @@ struct CudaPrefixSumKernelLauncher Real gridShift2 = zero; if( numberOfBlocks > 1 ) cudaRecursivePrefixSum( PrefixSumType::Inclusive, - operation, - volatileOperation, + reduction, zero, numberOfBlocks, blockSize, @@ -254,7 +254,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << " auxArray2 = " << auxArray2 << std::endl; cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> - ( operation, + ( reduction, size, elementsInBlock, gridShift, @@ -273,25 +273,21 @@ struct CudaPrefixSumKernelLauncher /**** * \brief Starts prefix sum in CUDA. 
* - * \tparam Operation operation to be performed on particular elements - addition usually - * \tparam VolatileOperation - volatile version of Operation + * \tparam Reduction reduction to be performed on particular elements - addition usually * \param size is number of elements to be scanned * \param blockSize is CUDA block size * \param deviceInput is pointer to input data on GPU * \param deviceOutput is pointer to resulting array, can be the same as input - * \param operation is instance of Operation - * \param volatileOperation is instance of VolatileOperation - * \param zero is neutral element for given Operation + * \param reduction is instance of Reduction + * \param zero is neutral element for given Reduction */ - template< typename Operation, - typename VolatileOperation > + template< typename Reduction > static void start( const Index size, const Index blockSize, const Real *deviceInput, Real* deviceOutput, - Operation& operation, - VolatileOperation& volatileOperation, + Reduction& reduction, const Real& zero ) { /**** @@ -319,8 +315,7 @@ struct CudaPrefixSumKernelLauncher //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, - operation, - volatileOperation, + reduction, zero, currentSize, blockSize, @@ -371,5 +366,3 @@ int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; } // namespace Algorithms } // namespace Containers } // namespace TNL - - diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h index 53d6d7f83..fb90ccb44 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -38,14 +38,12 @@ class PrefixSum< Devices::Host, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename 
Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; @@ -54,14 +52,12 @@ class PrefixSum< Devices::Cuda, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + typename Reduction > static void perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; @@ -70,16 +66,14 @@ class SegmentedPrefixSum< Devices::Host, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; @@ -88,21 +82,17 @@ class SegmentedPrefixSum< Devices::Cuda, Type > { public: template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > static void perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ); }; - - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 59c160bf1..0001a2f75 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -44,15 +44,13 @@ static constexpr int 
PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//25 // PrefixSum on host template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -86,7 +84,7 @@ perform( Vector& v, if( Type == PrefixSumType::Inclusive ) { #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) { - reduction( block_sum, v[ i ] ); + block_sum = reduction( block_sum, v[ i ] ); v[ i ] = block_sum; } } @@ -95,7 +93,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = block_sum; - reduction( block_sum, x ); + block_sum = reduction( block_sum, x ); } } @@ -106,17 +104,17 @@ perform( Vector& v, // calculate per-block offsets RealType offset = 0; for( int i = 0; i < thread_idx + 1; i++ ) - reduction( offset, block_sums[ i ] ); + offset = reduction( offset, block_sums[ i ] ); // shift intermediate results by the offset #pragma omp for schedule(static) for( IndexType i = begin; i < end; i++ ) - reduction( v[ i ], offset ); + v[ i ] = reduction( v[ i ], offset ); } #else if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { @@ -124,7 +122,7 @@ perform( Vector& v, for( IndexType i = begin; i < end; i++ ) { const RealType x = v[ i ]; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } #endif @@ -134,15 +132,13 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - 
typename VolatilePrefixSumOperation > + typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -155,7 +151,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, - volatileReduction, zero ); #endif } @@ -165,8 +160,7 @@ perform( Vector& v, // PrefixSum on host template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Host, Type >:: @@ -174,8 +168,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -186,7 +179,7 @@ perform( Vector& v, { for( IndexType i = begin + 1; i < end; i++ ) if( ! 
flags[ i ] ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { @@ -198,7 +191,7 @@ perform( Vector& v, if( flags[ i ] ) aux = zero; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } } @@ -207,8 +200,7 @@ perform( Vector& v, // PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Cuda, Type >:: @@ -216,8 +208,7 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, + const Reduction& reduction, const typename Vector::RealType& zero ) { using RealType = typename Vector::RealType; @@ -231,7 +222,6 @@ perform( Vector& v, &v[ begin ], &v[ begin ], reduction, - volatileReduction, zero );*/ #endif } diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp index 525ba6380..a8c626ee5 100644 --- a/src/TNL/Containers/Vector.hpp +++ b/src/TNL/Containers/Vector.hpp @@ -175,10 +175,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -193,10 +190,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] 
__cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp index 057d402d7..9cdde8ef2 100644 --- a/src/TNL/Containers/VectorView.hpp +++ b/src/TNL/Containers/VectorView.hpp @@ -125,10 +125,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -142,10 +139,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, reduction, volatileReduction, (RealType) 0.0 ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index aba569df7..3f0eee391 100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -189,7 
+189,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -197,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -205,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); @@ -213,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = 
v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -221,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -229,7 +229,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); -- GitLab From af6d1d6b6173b9ab3c725df2dc4cdbba9cae22a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 15 Aug 2019 13:22:42 +0200 Subject: [PATCH 18/23] Added default stream synchronizations after kernel launches in CudaPrefixSumKernel.h --- .../Containers/Algorithms/CudaPrefixSumKernel.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 38eeb6d8d..9d21e8970 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -220,7 +220,7 @@ struct CudaPrefixSumKernelLauncher */ const std::size_t sharedDataSize = elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; - const std::size_t sharedMemory 
= ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); + const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( prefixSumType_, reduction, @@ -231,6 +231,7 @@ struct CudaPrefixSumKernelLauncher output, auxArray1.getData(), gridShift ); + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; @@ -260,6 +261,7 @@ struct CudaPrefixSumKernelLauncher gridShift, auxArray2.getData(), output ); + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; cudaMemcpy( &gridShift, @@ -284,11 +286,11 @@ struct CudaPrefixSumKernelLauncher template< typename Reduction > static void start( const Index size, - const Index blockSize, - const Real *deviceInput, - Real* deviceOutput, - Reduction& reduction, - const Real& zero ) + const Index blockSize, + const Real *deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real& zero ) { /**** * Compute the number of grids @@ -323,11 +325,10 @@ struct CudaPrefixSumKernelLauncher gridShift, &deviceInput[ gridOffset ], &deviceOutput[ gridOffset ] ); - TNL_CHECK_CUDA_DEVICE; } /*** - * Store the number of CUDA grids for a purpose of unit testing, i.e. + * Store the number of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. 
*/ gridsCount = numberOfGrids; -- GitLab From 1fe626408f7735f7c099a6c364736ff7a2ee8206 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Thu, 15 Aug 2019 14:47:28 +0200 Subject: [PATCH 19/23] Replaced static member variables in CudaPrefixSumKernelLauncher with static getters --- .../Algorithms/CudaPrefixSumKernel.h | 41 ++++++++----------- src/TNL/Devices/Host.h | 12 +++--- .../Containers/VectorPrefixSumTest.h | 26 ++++++------ 3 files changed, 37 insertions(+), 42 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 9d21e8970..8ac802f96 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -297,8 +297,7 @@ struct CudaPrefixSumKernelLauncher */ const Index elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - //const auto maxGridSize = 3; //Devices::Cuda::getMaxGridSize(); - const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); Real gridShift = zero; //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; @@ -310,10 +309,10 @@ struct CudaPrefixSumKernelLauncher /**** * Compute current grid size and size of data to be scanned */ - const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; - if( currentSize / elementsInBlock > maxGridSize ) - currentSize = maxGridSize * elementsInBlock; + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; cudaRecursivePrefixSum( prefixSumType, @@ -331,37 +330,31 @@ struct CudaPrefixSumKernelLauncher * Store the number 
of CUDA grids for the purpose of unit testing, i.e. * to check if we test the algorithm with more than one CUDA grid. */ - gridsCount = numberOfGrids; + gridsCount() = numberOfGrids; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. */ - static void setMaxGridSize( int newMaxGridSize ) { - maxGridSize = newMaxGridSize; + static int& maxGridSize() + { + static int maxGridSize = Devices::Cuda::getMaxGridSize(); + return maxGridSize; } - static void resetMaxGridSize() { - maxGridSize = Devices::Cuda::getMaxGridSize(); + static void resetMaxGridSize() + { + maxGridSize() = Devices::Cuda::getMaxGridSize(); } - static int maxGridSize; - - static int gridsCount; + static int& gridsCount() + { + static int gridsCount = -1; + return gridsCount; + } }; -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::maxGridSize = Devices::Cuda::getMaxGridSize(); - -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; - - #endif } // namespace Algorithms diff --git a/src/TNL/Devices/Host.h b/src/TNL/Devices/Host.h index 649749f58..40f55711a 100644 --- a/src/TNL/Devices/Host.h +++ b/src/TNL/Devices/Host.h @@ -113,17 +113,19 @@ public: } protected: - static bool& ompEnabled() { + static bool& ompEnabled() + { #ifdef HAVE_OPENMP - static bool ompEnabled( true ); + static bool ompEnabled = true; #else - static bool ompEnabled( false ); + static bool ompEnabled = false; #endif return ompEnabled; } - static int& maxThreadsCount() { - static int maxThreadsCount( -1 ); + static int& maxThreadsCount() + { + static int maxThreadsCount = -1; return maxThreadsCount; } }; diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 3f0eee391..3cba026e3 
100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -74,11 +74,11 @@ TYPED_TEST( VectorTest, prefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; v = 0; v_host = -1; v.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -86,7 +86,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); @@ -94,7 +94,7 @@ TYPED_TEST( VectorTest, prefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i + 1 ); @@ -102,7 +102,7 @@ TYPED_TEST( VectorTest, prefixSum ) v = 0; v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -110,7 +110,7 @@ TYPED_TEST( VectorTest, prefixSum ) setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); @@ -184,12 +184,12 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -197,7 +197,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -205,7 +205,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); @@ -213,7 +213,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], i ); @@ -221,7 +221,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) v.setValue( 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) EXPECT_EQ( v_host[ i ], 0 ); @@ -229,7 +229,7 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, 
RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 1; i < size; i++ ) EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); -- GitLab From ac2ee07e3822d6a403a58d2042f8da52786ed59d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 14:51:03 +0200 Subject: [PATCH 20/23] CUDA prefix-sum: moved gridShift from the first phase to the second phase --- .../Algorithms/CudaPrefixSumKernel.h | 34 ++++++------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 8ac802f96..3153bae28 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -34,8 +34,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, const Index elementsInBlock, const Real* input, Real* output, - Real* auxArray, - const Real gridShift ) + Real* auxArray ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; @@ -67,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx += blockDim.x; } } - if( blockIdx.x == 0 && threadIdx.x == 0 ) - sharedData[ 0 ] = reduction( sharedData[ 0 ], gridShift ); /*** * Perform the sequential prefix-sum. 
@@ -150,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, { if( prefixSumType == PrefixSumType::Exclusive ) { - Real aux = zero; - aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); - aux = reduction( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); - auxArray[ blockIdx.x ] = aux; + auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], + sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); } else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; @@ -172,15 +167,13 @@ cudaSecondPhaseBlockPrefixSum( Reduction reduction, Real* data ) { if( blockIdx.x > 0 ) + gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); + const Index readOffset = blockIdx.x * elementsInBlock; + Index readIdx = threadIdx.x; + while( readIdx < elementsInBlock && readOffset + readIdx < size ) { - const Real shift = auxArray[ blockIdx.x - 1 ]; - const Index readOffset = blockIdx.x * elementsInBlock; - Index readIdx = threadIdx.x; - while( readIdx < elementsInBlock && readOffset + readIdx < size ) - { - data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); - readIdx += blockDim.x; - } + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); + readIdx += blockDim.x; } } @@ -229,8 +222,7 @@ struct CudaPrefixSumKernelLauncher elementsInBlock, input, output, - auxArray1.getData(), - gridShift ); + auxArray1.getData() ); cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; @@ -264,12 +256,8 @@ struct CudaPrefixSumKernelLauncher cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; - cudaMemcpy( &gridShift, - &auxArray2[ auxArraySize - 1 ], - sizeof( Real ), - cudaMemcpyDeviceToHost ); + gridShift = auxArray2.getElement( auxArraySize - 1 ); //std::cerr << "gridShift = " << gridShift << std::endl; - TNL_CHECK_CUDA_DEVICE; } /**** -- 
GitLab From 2c40015f50fe443132325e2aebcd0f14322646a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 20:12:38 +0200 Subject: [PATCH 21/23] CUDA prefix-sum: separated the implementation of the first and second phase --- .../Algorithms/CudaPrefixSumKernel.h | 286 ++++++++++-------- src/TNL/Containers/Algorithms/PrefixSum.hpp | 30 +- 2 files changed, 177 insertions(+), 139 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 3153bae28..ae3bb84de 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -31,7 +31,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, Reduction reduction, const Real zero, const Index size, - const Index elementsInBlock, + const int elementsInBlock, const Real* input, Real* output, Real* auxArray ) @@ -46,8 +46,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, /*** * Load data into the shared memory. 
*/ - const Index blockOffset = blockIdx.x * elementsInBlock; - Index idx = threadIdx.x; + const int blockOffset = blockIdx.x * elementsInBlock; + int idx = threadIdx.x; if( prefixSumType == PrefixSumType::Exclusive ) { if( idx == 0 ) @@ -81,7 +81,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; } - Index chunkPointer( 1 ); + int chunkPointer = 1; while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { @@ -132,7 +132,7 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { - const Index chunkIdx = idx / chunkSize; + const int chunkIdx = idx / chunkSize; Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; @@ -161,18 +161,20 @@ template< typename Real, __global__ void cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, - const Index elementsInBlock, - Real gridShift, + const int elementsInBlock, + const Index gridIdx, + const Index maxGridSize, const Real* auxArray, - Real* data ) + Real* data, + Real shift ) { - if( blockIdx.x > 0 ) - gridShift = reduction( gridShift, auxArray[ blockIdx.x - 1 ] ); - const Index readOffset = blockIdx.x * elementsInBlock; - Index readIdx = threadIdx.x; + if( gridIdx > 0 || blockIdx.x > 0 ) + shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); + const int readOffset = blockIdx.x * elementsInBlock; + int readIdx = threadIdx.x; while( readIdx < elementsInBlock && readOffset + readIdx < size ) { - data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], gridShift ); + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); readIdx += blockDim.x; } } @@ -182,143 +184,183 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { + /**** + * \brief Performs both phases of prefix sum. 
+ * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral element for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. + */ template< typename Reduction > static void - cudaRecursivePrefixSum( PrefixSumType prefixSumType_, - Reduction& reduction, - const Real& zero, - const Index size, - const Index blockSize, - const Index elementsInBlock, - Real& gridShift, - const Real* input, - Real* output ) + perform( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) + { + const auto blockShifts = performFirstPhase( + size, + deviceInput, + deviceOutput, + reduction, + zero, + blockSize ); + performSecondPhase( + size, + deviceOutput, + blockShifts.getData(), + reduction, + zero, + blockSize ); + } + + /**** + * \brief Performs the first phase of prefix sum. + * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral value for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. 
+ */ + template< typename Reduction > + static auto + performFirstPhase( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) { + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - const Index auxArraySize = numberOfBlocks; - - Array< Real, Devices::Cuda > auxArray1, auxArray2; - auxArray1.setSize( auxArraySize ); - auxArray2.setSize( auxArraySize ); - - /**** - * Setup block and grid size. - */ - dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); - cudaBlockSize.x = blockSize; - cudaGridSize.x = roundUpDivision( size, elementsInBlock ); - - /**** - * Run the kernel. - */ - const std::size_t sharedDataSize = elementsInBlock + - elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; - const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); - cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( prefixSumType_, - reduction, - zero, - size, - elementsInBlock, - input, - output, - auxArray1.getData() ); + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); + //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; + + // allocate array for the block sums + Array< Real, Devices::Cuda > blockSums; + blockSums.setSize( numberOfBlocks ); + + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; + Index currentSize = size - gridOffset; + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; + //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; + + // setup block and grid size + dim3 cudaBlockSize, 
cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + const std::size_t sharedDataSize = elementsInBlock + + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; + const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); + cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( prefixSumType, + reduction, + zero, + currentSize, + elementsInBlock, + &deviceInput[ gridOffset ], + &deviceOutput[ gridOffset ], + &blockSums[ gridIdx * maxGridSize() ] ); + } + + // synchronize the null-stream after all grids cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; - - //std::cerr << " auxArray1 = " << auxArray1 << std::endl; - /*** - * In auxArray1 there is now a sum of numbers in each block. - * We must compute prefix-sum of auxArray1 and then shift - * each block. - */ - Real gridShift2 = zero; - if( numberOfBlocks > 1 ) - cudaRecursivePrefixSum( PrefixSumType::Inclusive, + // blockSums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + if( numberOfBlocks > 1 ) { + CudaPrefixSumKernelLauncher< PrefixSumType::Inclusive, Real, Index >::perform( + blockSums.getSize(), + blockSums.getData(), + blockSums.getData(), reduction, zero, - numberOfBlocks, - blockSize, - elementsInBlock, - gridShift2, - auxArray1.getData(), - auxArray2.getData() ); - - //std::cerr << " auxArray2 = " << auxArray2 << std::endl; - cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> - ( reduction, - size, - elementsInBlock, - gridShift, - auxArray2.getData(), - output ); - cudaStreamSynchronize(0); - TNL_CHECK_CUDA_DEVICE; + blockSize ); + } + + // Store the number of CUDA grids for the purpose of unit testing, i.e. + // to check if we test the algorithm with more than one CUDA grid. 
+ gridsCount() = numberOfGrids; - gridShift = auxArray2.getElement( auxArraySize - 1 ); - //std::cerr << "gridShift = " << gridShift << std::endl; + // blockSums now contains shift values for each block - to be used in the second phase + return blockSums; } /**** - * \brief Starts prefix sum in CUDA. + * \brief Performs the second phase of prefix sum. * - * \tparam Reduction reduction to be performed on particular elements - addition usually - * \param size is number of elements to be scanned - * \param blockSize is CUDA block size - * \param deviceInput is pointer to input data on GPU - * \param deviceOutput is pointer to resulting array, can be the same as input - * \param reduction is instance of Reduction - * \param zero is neutral element for given Reduction + * \param size Number of elements to be scanned. + * \param deviceOutput Pointer to output array on GPU. + * \param blockShifts Pointer to a GPU array containing the block shifts. It is the + * result of the first phase. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param shift A constant shifting all elements of the array (usually `zero`, i.e. + * the neutral value). + * \param blockSize The CUDA block size to be used for kernel launch. 
*/ template< typename Reduction > static void - start( const Index size, - const Index blockSize, - const Real *deviceInput, - Real* deviceOutput, - Reduction& reduction, - const Real& zero ) + performSecondPhase( const Index size, + Real* deviceOutput, + const Real* blockShifts, + Reduction& reduction, + const Real shift, + const Index blockSize = 256 ) { - /**** - * Compute the number of grids - */ - const Index elementsInBlock = 8 * blockSize; + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); - Real gridShift = zero; - //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; - /**** - * Loop over all grids. - */ - for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) - { - /**** - * Compute current grid size and size of data to be scanned - */ + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; if( currentSize / elementsInBlock > maxGridSize() ) currentSize = maxGridSize() * elementsInBlock; - //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; - cudaRecursivePrefixSum( prefixSumType, - reduction, - zero, - currentSize, - blockSize, - elementsInBlock, - gridShift, - &deviceInput[ gridOffset ], - &deviceOutput[ gridOffset ] ); + + // setup block and grid size + dim3 cudaBlockSize, cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> + ( reduction, + size, + elementsInBlock, + gridIdx, + (Index) maxGridSize(), + blockShifts, + &deviceOutput[ gridOffset ], + shift ); } - /*** - 
* Store the number of CUDA grids for the purpose of unit testing, i.e. - * to check if we test the algorithm with more than one CUDA grid. - */ - gridsCount() = numberOfGrids; + // synchronize the null-stream after all grids + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; } /**** diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 0001a2f75..807e507ef 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -141,17 +141,18 @@ perform( Vector& v, const Reduction& reduction, const typename Vector::RealType& zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; -#ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::perform( + end - begin, + &v[ begin ], // input + &v[ begin ], // output reduction, zero ); +#else + throw Exceptions::CudaSupportMissing(); #endif } @@ -211,18 +212,13 @@ perform( Vector& v, const Reduction& reduction, const typename Vector::RealType& zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; -#ifdef HAVE_CUDA - throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); // NOT IMPLEMENTED YET - /*CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], - reduction, - zero );*/ + + throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." 
); +#else + throw Exceptions::CudaSupportMissing(); #endif } -- GitLab From 174ad5fd2494c3128560e0d451ba89883e5d1097 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 21:05:19 +0200 Subject: [PATCH 22/23] PrefixSum: separate first and second phase for OpenMP implementation and expose performFirstPhase and performSecondPhase methods --- src/TNL/Containers/Algorithms/PrefixSum.h | 128 ++++++++----- src/TNL/Containers/Algorithms/PrefixSum.hpp | 190 ++++++++++++++------ 2 files changed, 213 insertions(+), 105 deletions(-) diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h index fb90ccb44..2b0e40458 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -26,71 +26,107 @@ enum class PrefixSumType { template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class PrefixSum; +struct PrefixSum; template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class SegmentedPrefixSum; +struct SegmentedPrefixSum; template< PrefixSumType Type > -class PrefixSum< Devices::Host, Type > +struct PrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename Reduction > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename 
BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class PrefixSum< Devices::Cuda, Type > +struct PrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename Reduction > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class SegmentedPrefixSum< Devices::Host, Type > +struct SegmentedPrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename Reduction, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + 
const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; template< PrefixSumType Type > -class SegmentedPrefixSum< Devices::Cuda, Type > +struct SegmentedPrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename Reduction, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - const Reduction& reduction, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index 807e507ef..8af19d09a 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -12,69 +12,58 @@ #pragma once -#include // std::unique_ptr - #include "PrefixSum.h" -//#define CUDA_REDUCTION_PROFILING - #include -#include -#include +#include #include +#include #include -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - namespace TNL { namespace Containers { namespace Algorithms { -/**** - * Arrays smaller than the following constant - * are reduced on CPU. The constant must not be larger - * than maximal CUDA grid size. 
- */ -static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -//// -// PrefixSum on host template< PrefixSumType Type > -template< typename Vector, - typename Reduction > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) +{ +#ifdef HAVE_OPENMP + const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero ); + performSecondPhase( v, blockShifts, begin, end, reduction, zero ); +#else + // sequential prefix-sum does not need a second phase + performFirstPhase( v, begin, end, reduction, zero ); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Host, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; #ifdef HAVE_OPENMP const int threads = Devices::Host::getMaxThreadsCount(); - std::unique_ptr< RealType[] > block_sums{ - // Workaround for nvcc 10.1.168 - it would modifie the simple expression - // `new RealType[reducedSize]` in the source code to `new (RealType[reducedSize])` - // which is not correct - see e.g. https://stackoverflow.com/a/39671946 - // Thus, the host compiler would spit out hundreds of warnings... - // Funnily enough, nvcc's behaviour depends on the context rather than the - // expression, because exactly the same simple expression in different places - // does not produce warnings. 
- #ifdef __NVCC__ - new RealType[ static_cast(threads) + 1 ] - #else - new RealType[ threads + 1 ] - #endif - }; + Array< RealType, Devices::Host > block_sums( threads + 1 ); block_sums[ 0 ] = zero; - #pragma omp parallel + #pragma omp parallel num_threads(threads) { // init const int thread_idx = omp_get_thread_num(); @@ -99,18 +88,15 @@ perform( Vector& v, // write the block sums into the buffer block_sums[ thread_idx + 1 ] = block_sum; - #pragma omp barrier + } - // calculate per-block offsets - RealType offset = 0; - for( int i = 0; i < thread_idx + 1; i++ ) - offset = reduction( offset, block_sums[ i ] ); + // block_sums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + for( int i = 1; i < threads + 1; i++ ) + block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] ); - // shift intermediate results by the offset - #pragma omp for schedule(static) - for( IndexType i = begin; i < end; i++ ) - v[ i ] = reduction( v[ i ], offset ); - } + // block_sums now contains shift values for each block - to be used in the second phase + return block_sums; #else if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) @@ -125,21 +111,57 @@ perform( Vector& v, aux = reduction( aux, x ); } } + + return 0; +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Host, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ) +{ + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + +#ifdef HAVE_OPENMP + const int threads = blockShifts.getSize() - 1; + + // launch exactly the same number of threads as in the first phase + #pragma omp parallel num_threads(threads) + { + const int 
thread_idx = omp_get_thread_num(); + const RealType offset = reduction( blockShifts[ thread_idx ], shift ); + + // shift intermediate results by the offset + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], offset ); + } +#else + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], shift ); #endif } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > -template< typename Vector, - typename Reduction > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; @@ -156,9 +178,61 @@ perform( Vector& v, #endif } +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Cuda, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) +{ +#ifdef HAVE_CUDA + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + return CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performFirstPhase( + end - begin, + &v[ begin ], // input + &v[ begin ], // output + reduction, + zero ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Cuda, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ) +{ +#ifdef HAVE_CUDA + using RealType = typename 
Vector::RealType; + using IndexType = typename Vector::IndexType; + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performSecondPhase( + end - begin, + &v[ begin ], // output + blockShifts.getData(), + reduction, + shift ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + -//// -// PrefixSum on host template< PrefixSumType Type > template< typename Vector, typename Reduction, @@ -170,7 +244,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; @@ -197,8 +271,6 @@ perform( Vector& v, } } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, typename Reduction, @@ -210,7 +282,7 @@ perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, const Reduction& reduction, - const typename Vector::RealType& zero ) + const typename Vector::RealType zero ) { #ifdef HAVE_CUDA using RealType = typename Vector::RealType; -- GitLab From d13a2d1870e714dbccd92623edd2a1e139462d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= Date: Fri, 16 Aug 2019 22:27:06 +0200 Subject: [PATCH 23/23] Implemented distributed prefix-sum Fixes #43 --- .../Algorithms/DistributedPrefixSum.h | 70 +++++ src/TNL/Containers/DistributedVector.h | 9 +- src/TNL/Containers/DistributedVector.hpp | 42 +-- src/TNL/Containers/DistributedVectorView.h | 9 +- src/TNL/Containers/DistributedVectorView.hpp | 42 +-- .../Containers/DistributedVectorTest.h | 268 ++++++++++++++++-- .../Containers/VectorPrefixSumTest.h | 140 +++++---- 7 files changed, 418 insertions(+), 162 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/DistributedPrefixSum.h diff --git a/src/TNL/Containers/Algorithms/DistributedPrefixSum.h 
b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h new file mode 100644 index 000000000..b81e2ac94 --- /dev/null +++ b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h @@ -0,0 +1,70 @@ +/*************************************************************************** + PrefixSum.h - description + ------------------- + begin : Aug 16, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< PrefixSumType Type > +struct DistributedPrefixSum +{ + template< typename DistributedVector, + typename Reduction > + static void + perform( DistributedVector& v, + typename DistributedVector::IndexType begin, + typename DistributedVector::IndexType end, + const Reduction& reduction, + const typename DistributedVector::RealType zero ) + { + using RealType = typename DistributedVector::RealType; + using DeviceType = typename DistributedVector::DeviceType; + using CommunicatorType = typename DistributedVector::CommunicatorType; + + const auto group = v.getCommunicationGroup(); + if( group != CommunicatorType::NullGroup ) { + // adjust begin and end for the local range + const auto localRange = v.getLocalRange(); + begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin(); + end = max( min( end, localRange.getEnd() ), localRange.getBegin() ) - localRange.getBegin(); + + // perform first phase on the local data + auto localView = v.getLocalView(); + const auto blockShifts = PrefixSum< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); + const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); + + // exchange local sums between ranks + const int nproc = CommunicatorType::GetSize( 
group ); + RealType dataForScatter[ nproc ]; + for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; + Vector< RealType, Devices::Host > rankSums( nproc ); + // NOTE: exchanging general data types does not work with MPI + CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); + + // compute prefix-sum of the per-rank sums + PrefixSum< Devices::Host, PrefixSumType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero ); + + // perform second phase: shift by the per-block and per-rank offsets + const int rank = CommunicatorType::GetRank( group ); + PrefixSum< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] ); + } + } +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h index 75f204dcc..3438ddbd0 100644 --- a/src/TNL/Containers/DistributedVector.h +++ b/src/TNL/Containers/DistributedVector.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVector& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index f50624c61..0a6ac1547 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -13,7 +13,7 @@ #pragma once #include "DistributedVector.h" -#include +#include namespace TNL { namespace Containers { @@ -298,44 +298,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< 
Algorithms::PrefixSumType Type > void DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index 0ab923200..f0e7d9127 100644 --- a/src/TNL/Containers/DistributedVectorView.h +++ b/src/TNL/Containers/DistributedVectorView.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVectorView& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void 
prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 2f1e4666e..0268e35da 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -13,7 +13,7 @@ #pragma once #include "DistributedVectorView.h" -#include +#include namespace TNL { namespace Containers { @@ -274,44 +274,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< Algorithms::PrefixSumType Type > void DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/UnitTests/Containers/DistributedVectorTest.h 
b/src/UnitTests/Containers/DistributedVectorTest.h index e2d07a02a..430a42d5e 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -43,32 +43,26 @@ protected: using VectorViewType = typename DistributedVectorType::LocalViewType; using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >; - const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; - DistributedVectorType x, y, z; - - DistributedVectorView x_view, y_view, z_view; + DistributedVectorType v; + DistributedVectorView v_view; + typename DistributedVectorType::HostType v_host; const int rank = CommunicatorType::GetRank(group); const int nproc = CommunicatorType::GetSize(group); + // should be small enough to have fast tests, but large enough to test + // prefix-sum with multiple CUDA grids + const int globalSize = 10000 * nproc; + DistributedVectorTest() { using LocalRangeType = typename DistributedVector::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - x.setDistribution( localRange, globalSize, group ); - y.setDistribution( localRange, globalSize, group ); - z.setDistribution( localRange, globalSize, group ); - - x_view.bind( x ); - y_view.bind( y ); - z_view.bind( z ); - - setConstantSequence( x, 1 ); - setLinearSequence( y ); - setNegativeLinearSequence( z ); + v.setDistribution( localRange, globalSize, group ); + v_view.bind( v ); + setConstantSequence( v, 1 ); } }; @@ -85,7 +79,247 @@ using DistributedVectorTypes = ::testing::Types< TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes ); -// TODO: distributed prefix sum +#if 1 +TYPED_TEST( DistributedVectorTest, prefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename 
TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. 
+ if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif + +#if 1 +TYPED_TEST( DistributedVectorTest, exclusivePrefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< 
Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( 
Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif #endif // HAVE_GTEST diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 3cba026e3..4659d365d 100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -26,48 +26,56 @@ TYPED_TEST( VectorTest, prefixSum ) using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) - return; + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; VectorType v( size ); ViewType v_view( v ); typename VectorType::HostType v_host( size ); - v = 0; + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 
0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. 
@@ -75,45 +83,56 @@ TYPED_TEST( VectorTest, prefixSum ) { #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; - v = 0; + + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); 
EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); #endif } @@ -128,8 +147,8 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) return; VectorType v; @@ -137,47 +156,48 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) ViewType v_view( v ); typename VectorType::HostType v_host( size ); - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) 
- EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. @@ -186,53 +206,55 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) #ifdef HAVE_CUDA Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) 
- EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); #endif } -- GitLab