From c60aa9e42a4afdb71e317c753dfac73ddc19858c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Thu, 11 Apr 2019 21:53:36 +0200 Subject: [PATCH 1/9] Rewritting parallel reduciton with lambda functions. --- .../Algorithms/CudaReductionKernel.h | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index a5823849d..08a767c60 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -39,14 +39,19 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSize, typename Operation, typename Index > +template< int blockSize, + typename Real, + typename FirstPhase, + typename SecondPhase, + typename Index, + typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( Operation operation, +CudaReductionKernel( const Real& initialValue, + FirstReduction& firstReduction, + SecondReduction& secondReduction, const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + ResultType* output ) { typedef Index IndexType; typedef typename Operation::ResultType ResultType; @@ -62,23 +67,23 @@ CudaReductionKernel( Operation operation, IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; - sdata[ tid ] = operation.initialValue(); + sdata[ tid ] = initialValue; /*** * Read data into the shared memory. We start with the * sequential reduction. 
*/ while( gid + 4 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); + firstReduction( sdata[ tid ], gid, input1, input2 ); + firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); gid += 2 * gridSize; } while( gid < size ) -- GitLab From 5c983e498eb31ae1d19d82e201fe589482b55a4c Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Fri, 12 Apr 2019 15:54:53 +0200 Subject: [PATCH 2/9] [WIP] Rewritting reduction using lambda functions. --- .../Algorithms/ArrayOperationsCuda.hpp | 7 +- .../Algorithms/CudaReductionKernel.h | 112 +++++++------- src/TNL/Containers/Algorithms/Reduction.h | 49 +++--- .../Containers/Algorithms/Reduction_impl.h | 140 ++++++++++-------- .../Algorithms/VectorOperationsHost_impl.h | 24 ++- src/TNL/Math.h | 13 +- 6 files changed, 196 insertions(+), 149 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 3efe7e4d4..ab8770013 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -220,9 +220,10 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; - Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; - reductionContainsValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); + //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; + //reductionContainsValue.setValue( value ); + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; + return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); } template< typename Element, diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 08a767c60..0a71aaa79 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -41,20 +41,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi template< int blockSize, typename Real, - typename FirstPhase, - typename SecondPhase, + typename DataFetcher, + typename Reduction, typename Index, - typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) > + typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( const Real& initialValue, - FirstReduction& firstReduction, - SecondReduction& secondReduction, +CudaReductionKernel( const Real& zero, + const DataFetcher& dataFetcher, + const Reduction& reduction, const Index size, - ResultType* output ) + Result* output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; + using RealType = Real; + using IndexType = Index; + using ResultType = Result; ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); @@ -67,28 +68,28 @@ CudaReductionKernel( const Real& initialValue, IndexType gid = blockIdx.x * blockDim. 
x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; - sdata[ tid ] = initialValue; + sdata[ tid ] = zero; /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) { - sdata[ tid ] = firstReduction( sdata[ tid ], gid ); - sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize ); - sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize ); - sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - firstReduction( sdata[ tid ], gid, input1, input2 ); - firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -103,19 +104,19 @@ CudaReductionKernel( const Real& initialValue, if( blockSize >= 1024 ) { if( tid < 512 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); + reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -123,7 +124,7 @@ 
CudaReductionKernel( const Real& initialValue, if( blockSize >= 128 ) { if( tid < 64 ) - operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -131,40 +132,40 @@ CudaReductionKernel( const Real& initialValue, /*** * This runs in one warp so it is synchronized implicitly. - */ + */ if( tid < 32 ) { volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -180,16 +181,21 @@ CudaReductionKernel( const Real& initialValue, } -template< typename Operation, typename Index > +template< typename Real, + typename DataFetcher, + typename Reduction, + typename Index, + typename Result > int 
-CudaReductionKernelLauncher( Operation& operation, - const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2, - typename Operation::ResultType*& output ) +CudaReductionKernelLauncher( const Index size, + const Reduction& reduction, + const DataFetcher& dataFetcher, + const Real& zero, + Result*& output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; + using RealType = Real; + using IndexType = Index; + using ResultType = Result; // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because @@ -227,55 +233,55 @@ CudaReductionKernelLauncher( Operation& operation, { case 512: CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, 
DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: - 
cudaFuncSetCacheConfig(CudaReductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index d4f45f30e..6eb07f999 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -10,7 +10,7 @@ // Implemented by: Tomas Oberhuber, Jakub Klinkovsky -#pragma once +#pragma once #include #include @@ -18,7 +18,7 @@ namespace TNL { namespace Containers { -namespace Algorithms { +namespace Algorithms { template< typename Device > class Reduction @@ -29,36 +29,45 @@ template<> class Reduction< Devices::Cuda > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result = decltype( DataFetcher::operator() ) > + static Result + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> class Reduction< Devices::Host > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result = 
decltype( DataFetcher::operator() ) > + static Result + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> class Reduction< Devices::MIC > { public: - template< typename Operation, typename Index > - static typename Operation::ResultType - reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ); + template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result = decltype( DataFetcher::operator() ) > + static Result + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h index b6a856249..375e6d004 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction_impl.h @@ -38,35 +38,37 @@ namespace Algorithms { */ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; -template< typename Operation, typename Index > -typename Operation::ResultType + +template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result > +Result Reduction< Devices::Cuda >:: -reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* deviceInput1, - const typename Operation::DataType2* deviceInput2 ) + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ) { #ifdef HAVE_CUDA - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - typedef typename Operation::LaterReductionOperation LaterReductionOperation; + using IndexType = Index; + using ResultType = Result; /*** * Only fundamental 
and pointer types can be safely reduced on host. Complex * objects stored on the device might contain pointers into the device memory, * in which case reduction on host might fail. */ - constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; + //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; /*** * First check if the input array(s) is/are large enough for the reduction on GPU. * Otherwise copy it/them to host and reduce on CPU. */ - if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) + // With lambda function we cannot reduce data on host - we do not know the data here. 
+ /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize ) { typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size ); @@ -74,12 +76,12 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); - return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 ); + return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 ); } else { return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr ); } - } + }*/ #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -91,10 +93,10 @@ reduce( Operation& operation, * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); - IndexType reducedSize = CudaReductionKernelLauncher( operation, - size, - deviceInput1, - deviceInput2, + IndexType reducedSize = CudaReductionKernelLauncher( size, + reduction, + dataFetcher, + zero, deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -120,8 +122,8 @@ reduce( Operation& operation, /*** * Reduce the data on the host system. */ - LaterReductionOperation laterReductionOperation; - const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr ); + auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; }; + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -134,12 +136,13 @@ reduce( Operation& operation, /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
*/ - LaterReductionOperation laterReductionOperation; + //LaterReductionOperation laterReductionOperation; + auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; while( reducedSize > 1 ) { - reducedSize = CudaReductionKernelLauncher( laterReductionOperation, - reducedSize, - deviceAux1, - (ResultType*) 0, + reducedSize = CudaReductionKernelLauncher( reducedSize, + reduction, + copyFetch, + zero, deviceAux1 ); } @@ -166,18 +169,19 @@ reduce( Operation& operation, #endif }; -template< typename Operation, typename Index > -typename Operation::ResultType +template< typename Index, + typename ReductionOperation, + typename DataFetcher, + typename Result > +Result Reduction< Devices::Host >:: -reduce( Operation& operation, - const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2 ) + reduce( const Index size, + ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ) { - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; + using IndexType = Index; + using ResultType = Result; constexpr int block_size = 128; const int blocks = size / block_size; @@ -185,23 +189,24 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable - ResultType result = operation.initialValue(); + ResultType result = zero; #pragma omp parallel { // initialize array for thread-local results - ResultType r[ 4 ] = { operation.initialValue(), - operation.initialValue(), - operation.initialValue(), - operation.initialValue() }; + ResultType r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); 
- operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); - operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); - operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); + operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); + operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); + operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } @@ -209,18 +214,19 @@ reduce( Operation& operation, #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( r[ 0 ], i, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); + //operation.dataFetcher( r[ 0 ], i, input1, input2 ); } // local reduction of unrolled results - operation.commonReduction( r[ 0 ], r[ 2 ] ); - operation.commonReduction( r[ 1 ], r[ 3 ] ); - operation.commonReduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - operation.commonReduction( result, r[ 0 ] ); + result = reduction( result, r[ 0 ] ); } } return result; @@ -229,37 +235,43 @@ reduce( Operation& operation, #endif if( blocks > 1 ) { // initialize array for unrolled results - ResultType r[ 4 ] = { operation.initialValue(), - operation.initialValue(), - operation.initialValue(), - operation.initialValue() }; + ResultType r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - 
operation.firstReduction( r[ 0 ], offset + i, input1, input2 ); - operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 ); - operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 ); - operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); + operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); + operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); + operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( r[ 0 ], i, input1, input2 ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); + //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results - operation.commonReduction( r[ 0 ], r[ 2 ] ); - operation.commonReduction( r[ 1 ], r[ 3 ] ); - operation.commonReduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); + + /*operation.reduction( r[ 0 ], r[ 2 ] ); + operation.reduction( r[ 1 ], r[ 3 ] ); + operation.reduction( r[ 0 ], r[ 1 ] );*/ return r[ 0 ]; } else { - ResultType result = operation.initialValue(); + ResultType result = zero; for( IndexType i = 0; i < size; i++ ) - operation.firstReduction( result, i, input1, input2 ); + result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h index 67400da64..d324806ab 100644 --- 
a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h @@ -46,15 +46,21 @@ ResultType VectorOperations< Devices::Host >:: getVectorMax( const Vector& v ) { - typedef typename Vector::RealType RealType; + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - Algorithms::ParallelReductionMax< RealType, ResultType > operation; + /*Algorithms::ParallelReductionMax< RealType, ResultType > operation; return Reduction< Devices::Host >::reduce( operation, v.getSize(), v.getData(), - ( RealType* ) 0 ); + ( RealType* ) 0 );*/ + const RealType* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; + return Reduction< Devices::Host >::reduce( v.getSize(), reduction, fetch, 0.0 ); + } template< typename Vector, typename ResultType > @@ -78,15 +84,21 @@ ResultType VectorOperations< Devices::Host >:: getVectorAbsMax( const Vector& v ) { - typedef typename Vector::RealType RealType; + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation; + /*Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation; return Reduction< Devices::Host >::reduce( operation, v.getSize(), v.getData(), - ( RealType* ) 0 ); + ( RealType* ) 0 );*/ + + const RealType* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto max = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; + return Reduction< Devices::Host >::reduce( v.getSize(), max, fetch, 0.0 ); } diff --git a/src/TNL/Math.h b/src/TNL/Math.h index 68eb1f556..dcfa91a34 100644 --- a/src/TNL/Math.h +++ b/src/TNL/Math.h @@ -8,7 +8,7 @@ /* See Copyright Notice in tnl/Copyright */ -#pragma once +#pragma once #include #include @@ -18,6 +18,13 @@ namespace TNL { +template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type > +__cuda_callable__ inline +ResultType sum( const T1& a, const T2& b ) +{ + return a + b; +} + /** * \brief This function returns minimum of two numbers. * @@ -105,7 +112,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t __cuda_callable__ ResultType argMax( const T1& a, const T2& b ) { - return ( a > b ) ? a : b; + return ( a > b ) ? a : b; } /*** @@ -125,7 +132,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t __cuda_callable__ ResultType argAbsMax( const T1& a, const T2& b ) { - return ( TNL::abs( a ) > TNL::abs( b ) ) ? a : b; + return ( TNL::abs( a ) > TNL::abs( b ) ) ? a : b; } /** -- GitLab From 8c1c80e2428d96fcd1259586a227688f24a74d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 13 Apr 2019 20:26:02 +0200 Subject: [PATCH 3/9] [WIP] New lambda function based reduction compiles but does not work in CUDA. 
--- .../Containers/Algorithms/ArrayAssignment.h | 2 - .../Algorithms/ArrayOperationsCuda.hpp | 34 +- .../Algorithms/CommonVectorOperations.h | 79 +++++ .../Algorithms/CudaReductionKernel.h | 62 ++-- src/TNL/Containers/Algorithms/Reduction.h | 12 +- .../Containers/Algorithms/Reduction_impl.h | 19 +- .../Containers/Algorithms/VectorOperations.h | 19 +- .../Algorithms/VectorOperationsCuda_impl.h | 8 +- .../Algorithms/VectorOperationsHost_impl.h | 293 +----------------- .../Containers/ArrayOperationsTest.h | 6 +- 10 files changed, 172 insertions(+), 362 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/CommonVectorOperations.h diff --git a/src/TNL/Containers/Algorithms/ArrayAssignment.h b/src/TNL/Containers/Algorithms/ArrayAssignment.h index 62ab43a84..7f5b8482e 100644 --- a/src/TNL/Containers/Algorithms/ArrayAssignment.h +++ b/src/TNL/Containers/Algorithms/ArrayAssignment.h @@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false > }; - - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index ab8770013..0570f10f4 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -205,8 +205,19 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." 
); - Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; - return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); + + Element1* d; + cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) ); + auto fetch = [=] __cuda_callable__ ( Index i ) { return d[ 0 ]; }; //( destination[ i ] == source[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; + return Reduction< Devices::Cuda >::reduce( + size, + reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }, + fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; }, + true ); + + /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; + return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ } template< typename Element, @@ -219,11 +230,11 @@ containsValue( const Element* data, { TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); + if( size == 0 ) return false; - //Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; - //reductionContainsValue.setValue( value ); - auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; - return Reduction< Devices::Cuda >::reduce( size, TNL::sum, fetch, 0.0 ); + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; }; + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, @@ -237,9 +248,16 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; - Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; + + if( size == 0 ) return false; + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; + return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); + + + /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); + return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/ } diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.h b/src/TNL/Containers/Algorithms/CommonVectorOperations.h new file mode 100644 index 000000000..bd362fbab --- /dev/null +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.h @@ -0,0 +1,79 @@ +/*************************************************************************** + CommonVectorOperations.h - description + ------------------- + begin : Apr 12, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Device > +struct CommonVectorOperations +{ + using DeviceType = Device; + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorMax( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorMin( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorAbsMax( const Vector& v ); + + 
template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorAbsMin( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorL1Norm( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorL2Norm( const Vector& v ); + + template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar > + static ResultType getVectorLpNorm( const Vector& v, const Scalar p ); + + template< typename Vector, typename ResultType = typename Vector::RealType > + static ResultType getVectorSum( const Vector& v ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar > + static ResultType getVectorDifferenceLpNorm( 
const Vector1& v1, const Vector2& v2, const Scalar p ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + + template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL + +#include diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 0a71aaa79..685a3a332 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -40,23 +40,21 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, - typename Real, + typename Result, typename DataFetcher, typename Reduction, - typename Index, - typename Result = decltype( std::declval< DataFetcher >( 0,0 ) ) > + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( const Real& zero, +CudaReductionKernel( const Result zero, const DataFetcher& dataFetcher, const Reduction& reduction, const Index size, Result* output ) { - using RealType = Real; using IndexType = Index; using ResultType = Result; - + ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** @@ -69,11 +67,12 @@ CudaReductionKernel( const Real& zero, const IndexType gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; + /*** * Read data into the shared memory. We start with the * sequential reduction. 
*/ - while( gid + 4 * gridSize < size ) + /*while( gid + 4 * gridSize < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); @@ -86,15 +85,14 @@ CudaReductionKernel( const Real& zero, sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; - } + }*/ while( gid < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); - - + return; //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); //return; @@ -104,19 +102,19 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 1024 ) { if( tid < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -124,7 +122,7 @@ CudaReductionKernel( const Real& zero, if( blockSize >= 128 ) { if( tid < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -138,34 +136,34 @@ CudaReductionKernel( const Real& zero, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work 
// We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -181,8 +179,7 @@ CudaReductionKernel( const Real& zero, } -template< typename Real, - typename DataFetcher, +template< typename DataFetcher, typename Reduction, typename Index, typename Result > @@ -190,10 +187,9 @@ int CudaReductionKernelLauncher( const Index size, const Reduction& reduction, const DataFetcher& dataFetcher, - const Real& zero, + const Result& zero, Result*& output ) { - using RealType = Real; using IndexType = Index; using ResultType = Result; @@ -236,49 +232,49 @@ CudaReductionKernelLauncher( const Index size, <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, 
output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + 
cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); break; case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Real, DataFetcher, Reduction, Index, Result >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index 6eb07f999..ece9016c5 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -30,9 +30,9 @@ class Reduction< Devices::Cuda > { public: template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result = decltype( DataFetcher::operator() ) > + typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, @@ -45,9 +45,9 @@ class Reduction< Devices::Host > { public: template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result = decltype( DataFetcher::operator() ) > + typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, @@ -60,9 +60,9 @@ class Reduction< Devices::MIC > { public: template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result = decltype( DataFetcher::operator() ) > + typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h index 375e6d004..b5fcabdc7 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ 
b/src/TNL/Containers/Algorithms/Reduction_impl.h @@ -40,9 +40,9 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25 template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result > + typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, @@ -109,7 +109,9 @@ Reduction< Devices::Cuda >:: /*** * Transfer the reduced data from device to host. */ - ResultType resultArray[ reducedSize ]; + //ResultType* resultArray[ reducedSize ]; + //std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + ResultType* resultArray = new ResultType[ reducedSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -122,14 +124,15 @@ Reduction< Devices::Cuda >:: /*** * Reduce the data on the host system. */ - auto copyFetch = [=] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, copyFetch, zero ); + auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. 
" << std::endl; #endif - + + delete[] resultArray; return result; } else { @@ -170,9 +173,9 @@ Reduction< Devices::Cuda >:: }; template< typename Index, + typename Result, typename ReductionOperation, - typename DataFetcher, - typename Result > + typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h index a3ef89fc3..1c89dbca0 100644 --- a/src/TNL/Containers/Algorithms/VectorOperations.h +++ b/src/TNL/Containers/Algorithms/VectorOperations.h @@ -11,6 +11,7 @@ #pragma once #include +#include #include #include @@ -22,7 +23,7 @@ template< typename Device > class VectorOperations{}; template<> -class VectorOperations< Devices::Host > +class VectorOperations< Devices::Host > : public CommonVectorOperations< Devices::Host > { public: template< typename Vector > @@ -36,7 +37,7 @@ public: const typename Vector::RealType& value, const Scalar thisElementMultiplicator ); - template< typename Vector, typename ResultType = typename Vector::RealType > + /*template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > @@ -83,12 +84,13 @@ public: template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + */ template< typename Vector, typename Scalar > static void vectorScalarMultiplication( Vector& v, Scalar alpha ); - template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > - static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/ template< 
typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > static void addVector( Vector1& y, @@ -116,7 +118,7 @@ public: }; template<> -class VectorOperations< Devices::Cuda > +class VectorOperations< Devices::Cuda > : public CommonVectorOperations< Devices::Cuda > { public: template< typename Vector > @@ -130,7 +132,7 @@ public: const typename Vector::RealType& value, const Scalar thisElementMultiplicator ); - template< typename Vector, typename ResultType = typename Vector::RealType > + /*template< typename Vector, typename ResultType = typename Vector::RealType > static ResultType getVectorMax( const Vector& v ); template< typename Vector, typename ResultType = typename Vector::RealType > @@ -177,12 +179,13 @@ public: template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 ); + */ template< typename Vector, typename Scalar > static void vectorScalarMultiplication( Vector& v, const Scalar alpha ); - template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > - static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 ); + /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType > + static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/ template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > static void addVector( Vector1& y, diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h index 05f334b05..d79b5f069 100644 --- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h @@ -39,7 +39,7 @@ addElement( Vector& v, v[ i ] = thisElementMultiplicator * v[ i ] + value; } -template< typename Vector, typename ResultType > +/*template< typename Vector, 
typename ResultType > ResultType VectorOperations< Devices::Cuda >:: getVectorMax( const Vector& v ) @@ -314,7 +314,7 @@ getVectorDifferenceSum( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ #ifdef HAVE_CUDA template< typename Real, typename Index, typename Scalar > @@ -358,7 +358,7 @@ vectorScalarMultiplication( Vector& v, } -template< typename Vector1, typename Vector2, typename ResultType > +/*template< typename Vector1, typename Vector2, typename ResultType > ResultType VectorOperations< Devices::Cuda >:: getScalarProduct( const Vector1& v1, @@ -372,7 +372,7 @@ getScalarProduct( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ #ifdef HAVE_CUDA template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 > diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h index d324806ab..7b1bba80c 100644 --- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h @@ -41,295 +41,6 @@ addElement( Vector& v, v[ i ] = thisElementMultiplicator * v[ i ] + value; } -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorMax( const Vector& v ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - - /*Algorithms::ParallelReductionMax< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 );*/ - const RealType* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; - return Reduction< Devices::Host >::reduce( v.getSize(), reduction, fetch, 0.0 ); - -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorMin( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionMin< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorAbsMax( const Vector& v ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - /*Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 );*/ - - const RealType* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto max = [=] __cuda_callable__ ( const RealType& a, const RealType& b ) { return TNL::max( a, b ); }; - return Reduction< Devices::Host >::reduce( v.getSize(), max, fetch, 0.0 ); -} - - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorAbsMin( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - - Algorithms::ParallelReductionAbsMin< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorL1Norm( const Vector& v ) -{ - typedef typename Vector::RealType RealType; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionAbsSum< RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( RealType* ) 0 ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorL2Norm( const Vector& v ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionL2Norm< Real, ResultType > operation; - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); - return std::sqrt( result ); -} - -template< typename Vector, typename ResultType, typename Scalar > -ResultType -VectorOperations< Devices::Host >:: -getVectorLpNorm( const Vector& v, - const Scalar p ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
); - - if( p == 1.0 ) - return getVectorL1Norm< Vector, ResultType >( v ); - if( p == 2.0 ) - return getVectorL2Norm< Vector, ResultType >( v ); - - Algorithms::ParallelReductionLpNorm< Real, ResultType, Scalar > operation; - operation.setPower( p ); - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); - return std::pow( result, 1.0 / p ); -} - -template< typename Vector, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorSum( const Vector& v ) -{ - typedef typename Vector::RealType Real; - - TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - - Algorithms::ParallelReductionSum< Real, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v.getSize(), - v.getData(), - ( Real* ) 0 ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceMax( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceMin( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); - - Algorithms::ParallelReductionDiffMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceAbsMax( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffAbsMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceAbsMin( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffAbsMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceL1Norm( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); - - Algorithms::ParallelReductionDiffAbsSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceL2Norm( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffL2Norm< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); - return std::sqrt( result ); -} - - -template< typename Vector1, typename Vector2, typename ResultType, typename Scalar > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceLpNorm( const Vector1& v1, - const Vector2& v2, - const Scalar p ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
); - - if( p == 1.0 ) - return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 ); - if( p == 2.0 ) - return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 ); - - Algorithms::ParallelReductionDiffLpNorm< typename Vector1::RealType, typename Vector2::RealType, ResultType, Scalar > operation; - operation.setPower( p ); - const ResultType result = Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); - return std::pow( result, 1.0 / p ); -} - -template< typename Vector1, typename Vector2, typename ResultType > -ResultType -VectorOperations< Devices::Host >:: -getVectorDifferenceSum( const Vector1& v1, - const Vector2& v2 ) -{ - TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); - TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); - - Algorithms::ParallelReductionDiffSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation; - return Reduction< Devices::Host >::reduce( operation, - v1.getSize(), - v1.getData(), - v2.getData() ); -} template< typename Vector, typename Scalar > @@ -351,7 +62,7 @@ vectorScalarMultiplication( Vector& v, } -template< typename Vector1, typename Vector2, typename ResultType > +/*template< typename Vector1, typename Vector2, typename ResultType > ResultType VectorOperations< Devices::Host >:: getScalarProduct( const Vector1& v1, @@ -365,7 +76,7 @@ getScalarProduct( const Vector1& v1, v1.getSize(), v1.getData(), v2.getData() ); -} +}*/ template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 > void diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h index 23b8fcd4e..9a0d24039 100644 --- a/src/UnitTests/Containers/ArrayOperationsTest.h +++ b/src/UnitTests/Containers/ArrayOperationsTest.h @@ -303,9 +303,10 @@ TYPED_TEST( ArrayOperationsTest, copyMemoryWithConversions_cuda ) ArrayOperations< Devices::Cuda 
>::freeMemory( deviceData2 ); } -TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) +//TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) +void Test() { - using ValueType = typename TestFixture::ValueType; + using ValueType = double;//typename TestFixture::ValueType; const int size = ARRAY_TEST_SIZE; ValueType *hostData, *deviceData, *deviceData2; @@ -445,6 +446,7 @@ TYPED_TEST( ArrayOperationsTest, containsOnlyValue_cuda ) #include "../GtestMissingError.h" int main( int argc, char* argv[] ) { + Test(); #ifdef HAVE_GTEST ::testing::InitGoogleTest( &argc, argv ); return RUN_ALL_TESTS(); -- GitLab From ec25e9a1f58b3ff1c4feed3ccdd15d0a5b489b2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sat, 13 Apr 2019 22:01:49 +0200 Subject: [PATCH 4/9] [WIP] Deleted debugging code. --- .../Algorithms/ArrayOperationsCuda.hpp | 5 +---- .../Algorithms/CudaReductionKernel.h | 8 +++---- .../Containers/Algorithms/Reduction_impl.h | 8 +++---- .../Containers/ArrayOperationsTest.h | 22 +++++++++++++++---- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 0570f10f4..472eb414a 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -193,7 +193,6 @@ copySTLList( DestinationElement* destination, copiedElements += copySize; } } - template< typename Element1, typename Element2, typename Index > @@ -206,9 +205,7 @@ compareMemory( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." 
); - Element1* d; - cudaMalloc( ( void** ) &d, size * sizeof( Element1 ) ); - auto fetch = [=] __cuda_callable__ ( Index i ) { return d[ 0 ]; }; //( destination[ i ] == source[ i ] ); }; + auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; return Reduction< Devices::Cuda >::reduce( size, diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 685a3a332..21331defe 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -72,7 +72,7 @@ CudaReductionKernel( const Result zero, * Read data into the shared memory. We start with the * sequential reduction. */ - /*while( gid + 4 * gridSize < size ) + while( gid + 4 * gridSize < size ) { sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); @@ -85,14 +85,14 @@ CudaReductionKernel( const Result zero, sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; - }*/ + } while( gid < size ) { - sdata[ tid ] = dataFetcher( gid ); //reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); - return; + //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); //return; diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction_impl.h index b5fcabdc7..0fee746cf 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction_impl.h @@ -110,9 +110,9 @@ Reduction< Devices::Cuda >:: * Transfer the reduced data from device to host. 
*/ //ResultType* resultArray[ reducedSize ]; - //std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - ResultType* resultArray = new ResultType[ reducedSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); + std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + //ResultType* resultArray = new ResultType[ reducedSize ]; + ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -132,7 +132,7 @@ Reduction< Devices::Cuda >:: std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif - delete[] resultArray; + //delete[] resultArray; return result; } else { diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h index 9a0d24039..21bfccdea 100644 --- a/src/UnitTests/Containers/ArrayOperationsTest.h +++ b/src/UnitTests/Containers/ArrayOperationsTest.h @@ -303,10 +303,9 @@ TYPED_TEST( ArrayOperationsTest, copyMemoryWithConversions_cuda ) ArrayOperations< Devices::Cuda >::freeMemory( deviceData2 ); } -//TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) -void Test() +TYPED_TEST( ArrayOperationsTest, compareMemory_cuda ) { - using ValueType = double;//typename TestFixture::ValueType; + using ValueType = typename TestFixture::ValueType; const int size = ARRAY_TEST_SIZE; ValueType *hostData, *deviceData, *deviceData2; @@ -446,7 +445,22 @@ TYPED_TEST( ArrayOperationsTest, containsOnlyValue_cuda ) #include "../GtestMissingError.h" int main( int argc, char* argv[] ) { - Test(); + + using ValueType = double; + int size = 1000; + ValueType *hostData, *deviceData, *deviceData2; + ArrayOperations< Devices::Host >::allocateMemory( hostData, size ); + ArrayOperations< Devices::Cuda >::allocateMemory( deviceData, size ); + ArrayOperations< Devices::Cuda >::allocateMemory( 
deviceData2, size ); + + ArrayOperations< Devices::Host >::setMemory( hostData, (ValueType) 7, size ); + ArrayOperations< Devices::Cuda >::setMemory( deviceData, (ValueType) 8, size ); + ArrayOperations< Devices::Cuda >::setMemory( deviceData2, (ValueType) 9, size ); + EXPECT_FALSE(( ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< ValueType, ValueType >( hostData, deviceData, size ) )); + EXPECT_FALSE(( ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< ValueType, ValueType >( deviceData, hostData, size ) )); + EXPECT_FALSE(( ArrayOperations< Devices::Cuda >::compareMemory< ValueType, ValueType >( deviceData, deviceData2, size ) )); + + return 0; #ifdef HAVE_GTEST ::testing::InitGoogleTest( &argc, argv ); return RUN_ALL_TESTS(); -- GitLab From 0ef6688575b2474cda5bec9349810a34390572fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 14 Apr 2019 10:36:43 +0200 Subject: [PATCH 5/9] Parallel reduction is working. --- .../Algorithms/ArrayOperationsCuda.hpp | 6 +----- .../Containers/Algorithms/CudaReductionKernel.h | 7 ++----- src/UnitTests/Containers/ArrayOperationsTest.h | 16 ---------------- 3 files changed, 3 insertions(+), 26 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 472eb414a..a12b9c67f 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -207,11 +207,7 @@ compareMemory( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( - size, - reduction, //[=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }, - fetch, //[=] __cuda_callable__ ( Index i ) { return destination[ i ]; }, - true ); + return Reduction< 
Devices::Cuda >::reduce( size, reduction, fetch, true ); /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 21331defe..8fea90f8f 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -47,8 +47,8 @@ template< int blockSize, __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, - const DataFetcher& dataFetcher, - const Reduction& reduction, + const DataFetcher dataFetcher, + const Reduction reduction, const Index size, Result* output ) { @@ -94,8 +94,6 @@ CudaReductionKernel( const Result zero, __syncthreads(); //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); - - //return; /*** * Perform the parallel reduction. */ @@ -127,7 +125,6 @@ CudaReductionKernel( const Result zero, //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } - /*** * This runs in one warp so it is synchronized implicitly. 
*/ diff --git a/src/UnitTests/Containers/ArrayOperationsTest.h b/src/UnitTests/Containers/ArrayOperationsTest.h index 21bfccdea..23b8fcd4e 100644 --- a/src/UnitTests/Containers/ArrayOperationsTest.h +++ b/src/UnitTests/Containers/ArrayOperationsTest.h @@ -445,22 +445,6 @@ TYPED_TEST( ArrayOperationsTest, containsOnlyValue_cuda ) #include "../GtestMissingError.h" int main( int argc, char* argv[] ) { - - using ValueType = double; - int size = 1000; - ValueType *hostData, *deviceData, *deviceData2; - ArrayOperations< Devices::Host >::allocateMemory( hostData, size ); - ArrayOperations< Devices::Cuda >::allocateMemory( deviceData, size ); - ArrayOperations< Devices::Cuda >::allocateMemory( deviceData2, size ); - - ArrayOperations< Devices::Host >::setMemory( hostData, (ValueType) 7, size ); - ArrayOperations< Devices::Cuda >::setMemory( deviceData, (ValueType) 8, size ); - ArrayOperations< Devices::Cuda >::setMemory( deviceData2, (ValueType) 9, size ); - EXPECT_FALSE(( ArrayOperations< Devices::Host, Devices::Cuda >::compareMemory< ValueType, ValueType >( hostData, deviceData, size ) )); - EXPECT_FALSE(( ArrayOperations< Devices::Cuda, Devices::Host >::compareMemory< ValueType, ValueType >( deviceData, hostData, size ) )); - EXPECT_FALSE(( ArrayOperations< Devices::Cuda >::compareMemory< ValueType, ValueType >( deviceData, deviceData2, size ) )); - - return 0; #ifdef HAVE_GTEST ::testing::InitGoogleTest( &argc, argv ); return RUN_ALL_TESTS(); -- GitLab From 4b52cc2bcfdf4ae836f6b831fb1ebb3ddbc44e38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 14 Apr 2019 22:43:35 +0200 Subject: [PATCH 6/9] Rewritting lambdas with references. 
--- .../Algorithms/ArrayOperationsCuda.hpp | 29 +- .../Algorithms/CommonVectorOperations.hpp | 375 ++++++++++++++++++ .../Algorithms/CudaReductionKernel.h | 72 ++-- src/TNL/Containers/Algorithms/Reduction.h | 8 +- .../{Reduction_impl.h => Reduction.hpp} | 78 ++-- 5 files changed, 465 insertions(+), 97 deletions(-) create mode 100644 src/TNL/Containers/Algorithms/CommonVectorOperations.hpp rename src/TNL/Containers/Algorithms/{Reduction_impl.h => Reduction.hpp} (74%) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index a12b9c67f..bede985bc 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -205,12 +205,10 @@ compareMemory( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); - - /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; - return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/ + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; + //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } template< typename Element, @@ -225,9 +223,10 @@ containsValue( const Element* data, TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; - auto fetch = [=] __cuda_callable__ ( Index 
i ) { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; + //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); } template< typename Element, @@ -243,14 +242,10 @@ containsOnlyValue( const Element* data, if( size == 0 ) return false; if( size == 0 ) return false; - auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); - - - /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; - reductionContainsOnlyValue.setValue( value ); - return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/ + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; + auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; + //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp new file mode 100644 index 000000000..1dc43b2c7 --- /dev/null +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp @@ -0,0 +1,375 @@ +/*************************************************************************** + CommonVectorOperations.hpp - description + 
------------------- + begin : Apr 12, 2019 + copyright : (C) 2019 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorMax( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorMin( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorAbsMax( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorAbsMin( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL1Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL2Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorLpNorm( const Vector& v, + const Scalar p ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + if( p == 1.0 ) + return getVectorL1Norm< Vector, ResultType >( v ); + if( p == 2.0 ) + return getVectorL2Norm< Vector, ResultType >( v ); + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::pow( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorSum( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); + + if( std::is_same< ResultType, bool >::value ) + abort(); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceMax( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceMin( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMax( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMin( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." 
); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL1Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL2Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." 
); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { + auto diff = data1[ i ] - data2[ i ]; + return diff * diff; + }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceLpNorm( const Vector1& v1, + const Vector2& v2, + const Scalar p ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
); + + if( p == 1.0 ) + return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 ); + if( p == 2.0 ) + return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::pow( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceSum( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getScalarProduct( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 8fea90f8f..a90e0a3aa 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -43,12 +43,14 @@ template< int blockSize, typename Result, typename DataFetcher, typename Reduction, + typename 
VolatileReduction, typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, Result* output ) { @@ -74,21 +76,21 @@ CudaReductionKernel( const Result zero, */ while( gid + 4 * gridSize < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -100,19 +102,19 @@ CudaReductionKernel( const Result zero, if( blockSize >= 1024 ) { if( tid < 512 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); + reduction( sdata[ tid ], 
sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -120,7 +122,7 @@ CudaReductionKernel( const Result zero, if( blockSize >= 128 ) { if( tid < 64 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -133,34 +135,34 @@ CudaReductionKernel( const Result zero, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -178,11 +180,13 @@ CudaReductionKernel( const Result zero, template< typename DataFetcher, typename Reduction, + typename VolatileReduction, typename Index, typename Result > int 
CudaReductionKernelLauncher( const Index size, const Reduction& reduction, + const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result*& output ) @@ -226,55 +230,55 @@ CudaReductionKernelLauncher( const Index size, { case 512: CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 32: - 
cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, 
Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index ece9016c5..7ac0d4c02 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -32,10 +32,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -47,10 +49,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -62,10 +66,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -74,4 +80,4 @@ public: } // namespace Containers } // namespace TNL -#include "Reduction_impl.h" +#include "Reduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction.hpp similarity index 74% rename from 
src/TNL/Containers/Algorithms/Reduction_impl.h rename to src/TNL/Containers/Algorithms/Reduction.hpp index 0fee746cf..22121de25 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -42,11 +42,13 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25 template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -94,10 +96,11 @@ Reduction< Devices::Cuda >:: */ ResultType* deviceAux1( 0 ); IndexType reducedSize = CudaReductionKernelLauncher( size, - reduction, - dataFetcher, - zero, - deviceAux1 ); + reduction, + volatileReduction, + dataFetcher, + zero, + deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -111,7 +114,6 @@ Reduction< Devices::Cuda >:: */ //ResultType* resultArray[ reducedSize ]; std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - //ResultType* resultArray = new ResultType[ reducedSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -125,28 +127,26 @@ Reduction< Devices::Cuda >:: * Reduce the data on the host system. */ auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. 
" << std::endl; #endif - - //delete[] resultArray; return result; } else { /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ - //LaterReductionOperation laterReductionOperation; auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; while( reducedSize > 1 ) { reducedSize = CudaReductionKernelLauncher( reducedSize, - reduction, - copyFetch, - zero, - deviceAux1 ); + reduction, + volatileReduction, + copyFetch, + zero, + deviceAux1 ); } #ifdef CUDA_REDUCTION_PROFILING @@ -175,11 +175,13 @@ Reduction< Devices::Cuda >:: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -202,14 +204,10 @@ Reduction< Devices::Host >:: for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); - r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); - /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); - operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); - operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); - operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ + reduction( r[ 0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } @@ -217,19 +215,18 @@ Reduction< Devices::Host >:: #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) - r[ 0 ] = reduction( r[ 0 ], 
dataFetcher( i ) ); - //operation.dataFetcher( r[ 0 ], i, input1, input2 ); + reduction( r[ 0 ], dataFetcher( i ) ); } // local reduction of unrolled results - r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); - r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); - r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); + reduction( r[ 0 ], r[ 2 ] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - result = reduction( result, r[ 0 ] ); + reduction( result, r[ 0 ] ); } } return result; @@ -244,37 +241,28 @@ Reduction< Devices::Host >:: for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); - r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); - /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); - operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); - operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); - operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ + reduction( r[ 0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) - r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); + reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results - r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); - r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); - r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); - - /*operation.reduction( r[ 0 ], r[ 2 ] ); - operation.reduction( r[ 1 ], r[ 3 ] ); - operation.reduction( r[ 0 ], r[ 1 ] );*/ - + reduction( r[ 0 ], r[ 2 
] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { ResultType result = zero; for( IndexType i = 0; i < size; i++ ) - result = reduction( result, dataFetcher( i ) ); + reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP -- GitLab From aa18c0a2925e30fa37f7455351bce217a7faa944 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= Date: Sun, 14 Apr 2019 23:01:09 +0200 Subject: [PATCH 7/9] Changing numeric limits -max to lowest. --- src/TNL/Containers/Algorithms/CommonVectorOperations.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp index 1dc43b2c7..5430f6eac 100644 --- a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp @@ -32,7 +32,7 @@ getVectorMax( const Vector& v ) auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -68,7 +68,7 @@ getVectorAbsMax( const Vector& v ) auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType 
>::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -189,7 +189,7 @@ getVectorDifferenceMax( const Vector1& v1, auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -231,7 +231,7 @@ getVectorDifferenceAbsMax( const Vector1& v1, auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > -- GitLab From 58af5093d15740f6bf3e3cb2c9715161654f64c8 Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Mon, 15 Apr 2019 13:40:51 +0200 Subject: [PATCH 8/9] Fixing CUDA array operations. 
--- src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index bede985bc..0badc8d79 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -208,7 +208,7 @@ compareMemory( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); } template< typename Element, @@ -226,7 +226,7 @@ containsValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; - //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); + return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); } template< typename Element, @@ -245,7 +245,7 @@ containsOnlyValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - //return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + return Reduction< Devices::Cuda >::reduce( size, 
reduction, volatileReduction, fetch, true ); } -- GitLab From 3f891223ca0065800b1952252d2f179f3853310b Mon Sep 17 00:00:00 2001 From: Tomas Oberhuber Date: Mon, 15 Apr 2019 16:04:24 +0200 Subject: [PATCH 9/9] Fixed use of buffer in CUDA parallel reduction. --- .../Algorithms/CommonVectorOperations.hpp | 4 +- .../Algorithms/CudaReductionKernel.h | 250 +++++++++++------- src/TNL/Containers/Algorithms/Reduction.hpp | 25 +- 3 files changed, 165 insertions(+), 114 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp index 5430f6eac..ef3317cb7 100644 --- a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp +++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp @@ -24,7 +24,7 @@ CommonVectorOperations< Device >:: getVectorMax( const Vector& v ) { TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); - + using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; @@ -42,7 +42,7 @@ CommonVectorOperations< Device >:: getVectorMin( const Vector& v ) { TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." 
); - + using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index a90e0a3aa..8701b8b7d 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -56,7 +56,7 @@ CudaReductionKernel( const Result zero, { using IndexType = Index; using ResultType = Result; - + ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); /*** @@ -178,22 +178,14 @@ CudaReductionKernel( const Result zero, } -template< typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index, - typename Result > -int -CudaReductionKernelLauncher( const Index size, - const Reduction& reduction, - const VolatileReduction& volatileReduction, - const DataFetcher& dataFetcher, - const Result& zero, - Result*& output ) +template< typename Index, + typename Result > +struct CudaReductionKernelLauncher { using IndexType = Index; using ResultType = Result; + //// // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -203,93 +195,159 @@ CudaReductionKernelLauncher( const Index size, // where blocksPerMultiprocessor is determined according to the number of // available registers on the multiprocessor. // On Tesla K40c, desGridSize = 8 * 15 = 120. 
- const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice(); - const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) - / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ); - const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); - dim3 blockSize, gridSize; - blockSize.x = Reduction_maxThreadsPerBlock; - gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); - - // create reference to the reduction buffer singleton and set size - const size_t buf_size = desGridSize * sizeof( ResultType ); - CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); - - // when there is only one warp per blockSize.x, we need to allocate two warps - // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * sizeof( ResultType ) - : blockSize.x * sizeof( ResultType ); + CudaReductionKernelLauncher( const Index size ) + : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ), + blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) + / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ), + desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ), + originalSize( size ) + { + } - /*** - * Depending on the blockSize we generate appropriate template instance. 
- */ - switch( blockSize.x ) + template< typename DataFetcher, + typename Reduction, + typename VolatileReduction > + int start( const Reduction& reduction, + const VolatileReduction& volatileReduction, + const DataFetcher& dataFetcher, + const Result& zero, + ResultType*& output ) { - case 512: - CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, 
VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + //// + // create reference to the reduction buffer singleton and set size + const size_t buf_size = 2 * desGridSize * sizeof( ResultType ); + CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); + cudaReductionBuffer.setSize( buf_size ); + output = cudaReductionBuffer.template getData< ResultType >(); + + this-> reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); + return this->reducedSize; } - TNL_CHECK_CUDA_DEVICE; - // return the size of the output array on the CUDA device - return gridSize.x; -} + template< typename Reduction, + typename VolatileReduction > + Result finish( const Reduction& reduction, + const VolatileReduction& volatileReduction, + const Result& zero ) + { + //// + // Input is the first half of the buffer, output is the second half + const size_t buf_size = desGridSize * sizeof( ResultType ); + CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); + 
ResultType* input = cudaReductionBuffer.template getData< ResultType >(); + ResultType* output = &input[ buf_size ]; + + auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; + while( this->reducedSize > 1 ) + { + this-> reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); + std::swap( input, output ); + } + + //// + // Copy result on CPU + ResultType result; + ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, output, 1 ); + return result; + } + + protected: + template< typename DataFetcher, + typename Reduction, + typename VolatileReduction > + int launch( const Index size, + const Reduction& reduction, + const VolatileReduction& volatileReduction, + const DataFetcher& dataFetcher, + const Result& zero, + Result* output ) + { + dim3 blockSize, gridSize; + blockSize.x = Reduction_maxThreadsPerBlock; + gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); + + //// + // when there is only one warp per blockSize.x, we need to allocate two warps + // worth of shared memory so that we don't index shared memory out of bounds + const IndexType shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * sizeof( ResultType ) + : blockSize.x * sizeof( ResultType ); + + /*** + * Depending on the blockSize we generate appropriate template instance. 
+ */ + switch( blockSize.x ) + { + case 512: + CudaReductionKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, 
DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + } + TNL_CHECK_CUDA_DEVICE; + + //// + // return the size of the output array on the CUDA device + return gridSize.x; + } + + const int activeDevice; + const int blocksdPerMultiprocessor; + const int desGridSize; + const IndexType originalSize; + IndexType reducedSize; +}; #endif } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 22121de25..31c93504c 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -91,11 +91,13 @@ Reduction< Devices::Cuda >:: timer.start(); #endif + CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + /**** * Reduce the data on the CUDA device. */ ResultType* deviceAux1( 0 ); - IndexType reducedSize = CudaReductionKernelLauncher( size, + IndexType reducedSize = reductionLauncher.start( reduction, volatileReduction, dataFetcher, @@ -112,7 +114,6 @@ Reduction< Devices::Cuda >:: /*** * Transfer the reduced data from device to host. 
*/ - //ResultType* resultArray[ reducedSize ]; std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); @@ -139,15 +140,7 @@ Reduction< Devices::Cuda >:: /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ - auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; - while( reducedSize > 1 ) { - reducedSize = CudaReductionKernelLauncher( reducedSize, - reduction, - volatileReduction, - copyFetch, - zero, - deviceAux1 ); - } + auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -156,14 +149,14 @@ Reduction< Devices::Cuda >:: timer.start(); #endif - ResultType resultArray[ 1 ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); - const ResultType result = resultArray[ 0 ]; + //ResultType resultArray[ 1 ]; + //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); + //const ResultType result = resultArray[ 0 ]; - #ifdef CUDA_REDUCTION_PROFILING + /*#ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl; - #endif + #endif*/ return result; } -- GitLab