From 4b52cc2bcfdf4ae836f6b831fb1ebb3ddbc44e38 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Sun, 14 Apr 2019 22:43:35 +0200
Subject: [PATCH] Rewriting reduction lambdas to take arguments by reference.

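The reduction functor convention changes: instead of returning the
combined value, a reduction lambda now takes its first argument by
reference and updates it in place, i.e. the signature moves from
( a, b ) -> result to ( Result& a, const Result& b ). Callers must
also supply a volatileReduction lambda operating on volatile
references, which the CUDA kernel uses in its warp-synchronous tail
where shared memory is combined without __syncthreads().

A minimal sketch of the new calling convention (data, size, Index and
Result are placeholders for the caller's input array, its length, the
index type and the accumulator type):

    auto fetch = [=] __cuda_callable__ ( Index i ) -> Result { return data[ i ]; };
    auto reduction = [=] __cuda_callable__ ( Result& a, const Result& b ) { a += b; };
    auto volatileReduction = [=] __cuda_callable__ ( volatile Result& a, volatile Result& b ) { a += b; };
    const Result sum = Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, ( Result ) 0 );
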
---
 .../Algorithms/ArrayOperationsCuda.hpp        |  29 +-
 .../Algorithms/CommonVectorOperations.hpp     | 375 ++++++++++++++++++
 .../Algorithms/CudaReductionKernel.h          |  72 ++--
 src/TNL/Containers/Algorithms/Reduction.h     |   8 +-
 .../{Reduction_impl.h => Reduction.hpp}       |  78 ++--
 5 files changed, 465 insertions(+), 97 deletions(-)
 create mode 100644 src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
 rename src/TNL/Containers/Algorithms/{Reduction_impl.h => Reduction.hpp} (74%)

diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
index a12b9c67fb..bede985bc5 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -205,12 +205,10 @@ compareMemory( const Element1* destination,
    TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
 
-   auto fetch = [=] __cuda_callable__ ( Index i ) { return  ( destination[ i ] == source[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
-   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
-
-   /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
-   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( destination[ i ] == source[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
 }
 
 template< typename Element,
@@ -225,9 +223,10 @@ containsValue( const Element* data,
    TNL_ASSERT_GE( size, 0, "" );
 
    if( size == 0 ) return false;
-   auto fetch = [=] __cuda_callable__ ( Index i ) { return  ( data[ i ] == value ); };
-   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; };
-   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false );
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( data[ i ] == value ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false );
 }
 
 template< typename Element,
@@ -243,14 +242,10 @@ containsOnlyValue( const Element* data,
    if( size == 0 ) return false;
 
-   auto fetch = [=] __cuda_callable__ ( Index i ) { return  ( data[ i ] == value ); };
-   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
-   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
-
-   
-   /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
-   reductionContainsOnlyValue.setValue( value );
-   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( data[ i ] == value ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
 }
 
 
diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
new file mode 100644
index 0000000000..1dc43b2c72
--- /dev/null
+++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
@@ -0,0 +1,375 @@
+/***************************************************************************
+                          CommonVectorOperations.hpp  -  description
+                             -------------------
+    begin                : Apr 12, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Algorithms/CommonVectorOperations.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
+
+namespace TNL {
+namespace Containers {
+namespace Algorithms {
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorMax( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+   
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
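+   // fetch maps an index to the value entering the reduction; reduction and
+   // volatileReduction fold b into a in place, the volatile variant serving
+   // the warp-synchronous phase of the CUDA reduction kernel.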
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorMin( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+   
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorAbsMax( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorAbsMin( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorL1Norm( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorL2Norm( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data[ i ] * data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::sqrt( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType, typename Scalar >
+ResultType
+CommonVectorOperations< Device >::
+getVectorLpNorm( const Vector& v,
+                 const Scalar p )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   if( p == 1.0 )
+      return getVectorL1Norm< Vector, ResultType >( v );
+   if( p == 2.0 )
+      return getVectorL2Norm< Vector, ResultType >( v );
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::pow( TNL::abs( data[ i ] ), p ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::pow( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorSum( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
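+   // Summing into a bool accumulator is meaningless; bail out if misused.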
+   if( std::is_same< ResultType, bool >::value )
+      abort();
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return  data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceMax( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] - data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceMin( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] - data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceAbsMax( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::abs( data1[ i ] - data2[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceAbsMin( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::abs( data1[ i ] - data2[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceL1Norm( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::abs( data1[ i ] - data2[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceL2Norm( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) {
+      auto diff = data1[ i ] - data2[ i ];
+      return diff * diff;
+   };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::sqrt( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType, typename Scalar >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceLpNorm( const Vector1& v1,
+                           const Vector2& v2,
+                           const Scalar p )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+   TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
+
+   if( p == 1.0 )
+      return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 );
+   if( p == 2.0 )
+      return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::pow( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceSum( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] - data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getScalarProduct( const Vector1& v1,
+                  const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] * data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+} // namespace Algorithms
+} // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index 8fea90f8f4..a90e0a3aab 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -43,12 +43,14 @@ template< int blockSize,
    typename Result,
    typename DataFetcher,
    typename Reduction,
+   typename VolatileReduction,
    typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
 CudaReductionKernel( const Result zero,
                      const DataFetcher dataFetcher,
                      const Reduction reduction,
+                     const VolatileReduction volatileReduction,
                      const Index size,
                      Result* output )
 {
@@ -74,21 +76,21 @@ CudaReductionKernel( const Result zero,
     */
    while( gid + 4 * gridSize < size )
    {
-      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
-      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
-      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
-      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
+      reduction( sdata[ tid ], dataFetcher( gid ) );
+      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
+      reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
+      reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
       gid += 4 * gridSize;
    }
    while( gid + 2 * gridSize < size )
    {
-      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
-      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
+      reduction( sdata[ tid ], dataFetcher( gid ) );
+      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
       gid += 2 * gridSize;
    }
    while( gid < size )
    {
-      sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) );
+      reduction( sdata[ tid ], dataFetcher( gid ) );
       gid += gridSize;
    }
    __syncthreads();
@@ -100,19 +102,19 @@ CudaReductionKernel( const Result zero,
    if( blockSize >= 1024 )
    {
       if( tid < 512 )
-         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] );
+         reduction( sdata[ tid ], sdata[ tid + 512 ] );
       __syncthreads();
    }
    if( blockSize >= 512 )
    {
       if( tid < 256 )
-         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] );
+         reduction( sdata[ tid ], sdata[ tid + 256 ] );
       __syncthreads();
    }
    if( blockSize >= 256 )
    {
       if( tid < 128 )
-         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] );
+         reduction( sdata[ tid ], sdata[ tid + 128 ] );
       __syncthreads();
       //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
    }
@@ -120,7 +122,7 @@ CudaReductionKernel( const Result zero,
    if( blockSize >= 128 )
    {
       if( tid <  64 )
-         sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] );
+         reduction( sdata[ tid ], sdata[ tid + 64 ] );
       __syncthreads();
       //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
    }
@@ -133,34 +135,34 @@ CudaReductionKernel( const Result zero,
       volatile ResultType* vsdata = sdata;
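+      // The steps below assume the threads of one warp run in lockstep, so
+      // no __syncthreads() is issued; volatileReduction works on volatile
+      // references to keep the compiler from caching sdata in registers.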
       if( blockSize >= 64 )
       {
-         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
          //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
       }
       // TODO: If blocksize == 32, the following does not work
       // We do not check if tid < 16. Fix it!!!
       if( blockSize >= 32 )
       {
-         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
          //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >= 16 )
       {
-         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
          //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >=  8 )
       {
-         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
          //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >=  4 )
       {
-         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
          //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >=  2 )
       {
-         vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
          //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
       }
    }
@@ -178,11 +180,13 @@ CudaReductionKernel( const Result zero,
 
 template< typename DataFetcher,
    typename Reduction,
+   typename VolatileReduction,
    typename Index,
    typename Result >
 int
 CudaReductionKernelLauncher( const Index size,
                              const Reduction& reduction,
+                             const VolatileReduction& volatileReduction,
                              const DataFetcher& dataFetcher,
                              const Result& zero,
                              Result*& output )
@@ -226,55 +230,55 @@ CudaReductionKernelLauncher( const Index size,
    {
       case 512:
          CudaReductionKernel< 512 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case 256:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel< 256 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case 128:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel< 128 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case  64:
-         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel<  64 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case  32:
-         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel<  32 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case  16:
-         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel<  16 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
      case   8:
-         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel<   8 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case   4:
-         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel<   4 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case   2:
-         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared);
+         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
 
          CudaReductionKernel<   2 >
-         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output);
+         <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
          break;
       case   1:
          TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h
index ece9016c59..7ac0d4c028 100644
--- a/src/TNL/Containers/Algorithms/Reduction.h
+++ b/src/TNL/Containers/Algorithms/Reduction.h
@@ -32,10 +32,12 @@ public:
    template< typename Index,
              typename Result,
              typename ReductionOperation,
+             typename VolatileReductionOperation,
              typename DataFetcher >
    static Result
    reduce( const Index size,
            ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
            DataFetcher& dataFetcher,
            const Result& zero );
 };
@@ -47,10 +49,12 @@ public:
    template< typename Index,
              typename Result,
              typename ReductionOperation,
+             typename VolatileReductionOperation,
              typename DataFetcher >
    static Result
    reduce( const Index size,
            ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
            DataFetcher& dataFetcher,
            const Result& zero );
 };
@@ -62,10 +66,12 @@ public:
    template< typename Index,
              typename Result,
              typename ReductionOperation,
+             typename VolatileReductionOperation,
              typename DataFetcher >
    static Result
    reduce( const Index size,
            ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
            DataFetcher& dataFetcher,
            const Result& zero );
 };
@@ -74,4 +80,4 @@ public:
 } // namespace Containers
 } // namespace TNL
 
-#include "Reduction_impl.h"
+#include "Reduction.hpp"
diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction.hpp
similarity index 74%
rename from src/TNL/Containers/Algorithms/Reduction_impl.h
rename to src/TNL/Containers/Algorithms/Reduction.hpp
index 0fee746cf0..22121de25f 100644
--- a/src/TNL/Containers/Algorithms/Reduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Reduction.hpp
@@ -42,11 +42,13 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25
 template< typename Index,
           typename Result,
           typename ReductionOperation,
+          typename VolatileReductionOperation,
           typename DataFetcher >
 Result
 Reduction< Devices::Cuda >::
    reduce( const Index size,
            ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
            DataFetcher& dataFetcher,
            const Result& zero )
 {
@@ -94,10 +96,11 @@ Reduction< Devices::Cuda >::
     */
    ResultType* deviceAux1( 0 );
    IndexType reducedSize = CudaReductionKernelLauncher( size,
-                                                        reduction,
-                                                        dataFetcher,
-                                                        zero,
-                                                        deviceAux1 );
+      reduction,
+      volatileReduction,
+      dataFetcher,
+      zero,
+      deviceAux1 );
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
       std::cout << "   Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
@@ -111,7 +114,6 @@ Reduction< Devices::Cuda >::
        */
       //ResultType* resultArray[ reducedSize ];
       std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
-      //ResultType* resultArray = new ResultType[ reducedSize ];
       ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize );
 
       #ifdef CUDA_REDUCTION_PROFILING
@@ -125,28 +127,26 @@ Reduction< Devices::Cuda >::
        * Reduce the data on the host system.
        */
       auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; };
-      const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero );
+      const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
          std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
       #endif
-      
-      //delete[] resultArray;
       return result;
    }
    else {
       /***
        * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
        */
-      //LaterReductionOperation laterReductionOperation;
       auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; };
       while( reducedSize > 1 ) {
          reducedSize = CudaReductionKernelLauncher( reducedSize,
-                                                    reduction,
-                                                    copyFetch,
-                                                    zero,
-                                                    deviceAux1 );
+            reduction,
+            volatileReduction,
+            copyFetch,
+            zero,
+            deviceAux1 );
       }
 
       #ifdef CUDA_REDUCTION_PROFILING
@@ -175,11 +175,13 @@ Reduction< Devices::Cuda >::
 template< typename Index,
           typename Result,
           typename ReductionOperation,
+          typename VolatileReductionOperation,
           typename DataFetcher >
 Result
 Reduction< Devices::Host >::
    reduce( const Index size,
            ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
            DataFetcher& dataFetcher,
            const Result& zero )
 {
@@ -202,14 +204,10 @@ Reduction< Devices::Host >::
          for( int b = 0; b < blocks; b++ ) {
             const IndexType offset = b * block_size;
             for( int i = 0; i < block_size; i += 4 ) {
-               r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
-               r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
-               r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
-               r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
-               /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 );
-               operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 );
-               operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 );
-               operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/
+               reduction( r[ 0 ], dataFetcher( offset + i ) );
+               reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
             }
          }
 
@@ -217,19 +215,18 @@ Reduction< Devices::Host >::
          #pragma omp single nowait
          {
             for( IndexType i = blocks * block_size; i < size; i++ )
-               r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
-               //operation.dataFetcher( r[ 0 ], i, input1, input2 );
+               reduction( r[ 0 ], dataFetcher( i ) );
          }
 
          // local reduction of unrolled results
-         r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
-         r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
-         r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );
+         reduction( r[ 0 ], r[ 2 ] );
+         reduction( r[ 1 ], r[ 3 ] );
+         reduction( r[ 0 ], r[ 1 ] );
 
          // inter-thread reduction of local results
          #pragma omp critical
          {
-            result = reduction( result, r[ 0 ] );
+            reduction( result, r[ 0 ] );
          }
       }
       return result;
@@ -244,37 +241,28 @@ Reduction< Devices::Host >::
          for( int b = 0; b < blocks; b++ ) {
             const IndexType offset = b * block_size;
             for( int i = 0; i < block_size; i += 4 ) {
-               r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
-               r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
-               r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
-               r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
-               /*operation.dataFetcher( r[ 0 ], offset + i,     input1, input2 );
-               operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 );
-               operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 );
-               operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/
+               reduction( r[ 0 ], dataFetcher( offset + i ) );
+               reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
             }
          }
 
          // reduction of the last, incomplete block (not unrolled)
          for( IndexType i = blocks * block_size; i < size; i++ )
-            r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
+            reduction( r[ 0 ], dataFetcher( i ) );
-            //operation.dataFetcher( r[ 0 ], i, input1, input2 );
 
          // reduction of unrolled results
-         r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
-         r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
-         r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );
-
-         /*operation.reduction( r[ 0 ], r[ 2 ] );
-         operation.reduction( r[ 1 ], r[ 3 ] );
-         operation.reduction( r[ 0 ], r[ 1 ] );*/
-
+         reduction( r[ 0 ], r[ 2 ] );
+         reduction( r[ 1 ], r[ 3 ] );
+         reduction( r[ 0 ], r[ 1 ] );
          return r[ 0 ];
       }
       else {
          ResultType result = zero;
          for( IndexType i = 0; i < size; i++ )
-            result = reduction( result, dataFetcher( i ) );
+            reduction( result, dataFetcher( i ) );
          return result;
       }
 #ifdef HAVE_OPENMP
-- 
GitLab