diff --git a/src/TNL/Containers/Algorithms/ArrayAssignment.h b/src/TNL/Containers/Algorithms/ArrayAssignment.h
index 62ab43a84aa4046ee195439bd3102d1be579f4f7..7f5b8482ee0bb570fa288e8ba7209e12ee90938a 100644
--- a/src/TNL/Containers/Algorithms/ArrayAssignment.h
+++ b/src/TNL/Containers/Algorithms/ArrayAssignment.h
@@ -86,8 +86,6 @@ struct ArrayAssignment< Array, T, false >
 
 };
 
-
-
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
index 3efe7e4d472bb5d9766faee73946941a480374e4..0badc8d799c5d5e81a10393c1ceb987ab3fc8e1e 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -193,7 +193,6 @@ copySTLList( DestinationElement* destination,
       copiedElements += copySize;
    }
 }
-
 template< typename Element1,
           typename Element2,
           typename Index >
@@ -205,8 +204,14 @@ compareMemory( const Element1* destination,
 {
    TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
-   Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
-   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );
+
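+   // The comparison is expressed as three lambdas: 'fetch' tests one pair of
+   // elements, 'reduction' combines partial results by logical AND (with a volatile
+   // variant for warp-synchronous reduction), and 'true' is the identity element.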
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( destination[ i ] == source[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
 }
 
 template< typename Element,
@@ -219,10 +224,14 @@ containsValue( const Element* data,
 {
    TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
    TNL_ASSERT_GE( size, 0, "" );
+
    if( size == 0 ) return false;
-   Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue;
-   reductionContainsValue.setValue( value );
-   return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr );
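+   // Reduce by logical OR over element-wise comparisons with 'value'; the identity
+   // element is 'false'.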
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return  ( data[ i ] == value ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false );
 }
 
 template< typename Element,
@@ -236,9 +245,11 @@ containsOnlyValue( const Element* data,
    TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." );
    TNL_ASSERT_GE( size, 0, "" );
    if( size == 0 ) return false;
-   Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
-   reductionContainsOnlyValue.setValue( value );
-   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );
+
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
 }
 
 
diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.h b/src/TNL/Containers/Algorithms/CommonVectorOperations.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd362fbabba8c93d64857dbc394860f6b23928f9
--- /dev/null
+++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.h
@@ -0,0 +1,82 @@
+/***************************************************************************
+                          CommonVectorOperations.h  -  description
+                             -------------------
+    begin                : Apr 12, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+namespace Containers {
+namespace Algorithms {
+
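+////
+// Device-independent implementations of vector reductions (extrema, norms, sums
+// and scalar products), expressed through the lambda-based Reduction interface.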
+template< typename Device >
+struct CommonVectorOperations
+{
+   using DeviceType = Device;
+
+   template< typename Vector, typename ResultType = typename Vector::RealType >
+   static ResultType getVectorMax( const Vector& v );
+
+   template< typename Vector, typename ResultType = typename Vector::RealType >
+   static ResultType getVectorMin( const Vector& v );
+
+   template< typename Vector, typename ResultType = typename Vector::RealType >
+   static ResultType getVectorAbsMax( const Vector& v );
+
+   template< typename Vector, typename ResultType = typename Vector::RealType >
+   static ResultType getVectorAbsMin( const Vector& v );
+
+   template< typename Vector, typename ResultType = typename Vector::RealType >
+   static ResultType getVectorL1Norm( const Vector& v );
+
+   template< typename Vector, typename ResultType = typename Vector::RealType >
+   static ResultType getVectorL2Norm( const Vector& v );
+
+   template< typename Vector, typename ResultType = typename Vector::RealType, typename Scalar >
+   static ResultType getVectorLpNorm( const Vector& v, const Scalar p );
+
+   template< typename Vector, typename ResultType = typename Vector::RealType >
+   static ResultType getVectorSum( const Vector& v );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getVectorDifferenceMax( const Vector1& v1, const Vector2& v2 );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getVectorDifferenceMin( const Vector1& v1, const Vector2& v2 );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getVectorDifferenceAbsMax( const Vector1& v1, const Vector2& v2 );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getVectorDifferenceAbsMin( const Vector1& v1, const Vector2& v2 );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getVectorDifferenceL1Norm( const Vector1& v1, const Vector2& v2 );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getVectorDifferenceL2Norm( const Vector1& v1, const Vector2& v2 );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType, typename Scalar >
+   static ResultType getVectorDifferenceLpNorm( const Vector1& v1, const Vector2& v2, const Scalar p );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 );
+
+   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );
+
+};
+
+} // namespace Algorithms
+} // namespace Containers
+} // namespace TNL
+
+#include <TNL/Containers/Algorithms/CommonVectorOperations.hpp>
diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef3317cb7a0636e3dc5ef32a1798addfdf90b6cb
--- /dev/null
+++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
@@ -0,0 +1,380 @@
+/***************************************************************************
+                          CommonVectorOperations.hpp  -  description
+                             -------------------
+    begin                : Apr 12, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Containers/Algorithms/CommonVectorOperations.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
+
+namespace TNL {
+namespace Containers {
+namespace Algorithms {
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorMax( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
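+   ////
+   // All reductions in this file follow the same pattern: 'fetch' produces the i-th
+   // value, 'reduction' combines two partial results (with a volatile variant for
+   // warp-synchronous reduction on CUDA), and the last argument is the identity element.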
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorMin( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a =  TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a =  TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorAbsMax( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorAbsMin( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorL1Norm( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorL2Norm( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data[ i ] * data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::sqrt( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType, typename Scalar >
+ResultType
+CommonVectorOperations< Device >::
+getVectorLpNorm( const Vector& v,
+                 const Scalar p )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   if( p == 1.0 )
+      return getVectorL1Norm< Vector, ResultType >( v );
+   if( p == 2.0 )
+      return getVectorL2Norm< Vector, ResultType >( v );
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::pow( TNL::abs( data[ i ] ), p ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::pow( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorSum( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
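+   // A sum of boolean values is ill-defined, so such instantiations abort at runtime.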
+   if( std::is_same< ResultType, bool >::value )
+      abort();
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return  data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceMax( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] - data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceMin( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] - data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceAbsMax( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::abs( data1[ i ] - data2[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceAbsMin( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::abs( data1[ i ] - data2[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceL1Norm( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::abs( data1[ i ] - data2[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceL2Norm( const Vector1& v1,
+                           const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) {
+      auto diff = data1[ i ] - data2[ i ];
+      return diff * diff;
+   };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::sqrt( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType, typename Scalar >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceLpNorm( const Vector1& v1,
+                           const Vector2& v2,
+                           const Scalar p )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+   TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
+
+   if( p == 1.0 )
+      return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 );
+   if( p == 2.0 )
+      return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::pow( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceSum( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] - data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getScalarProduct( const Vector1& v1,
+                  const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return  data1[ i ] * data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+} // namespace Algorithms
+} // namespace Containers
+} // namespace TNL
diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
index a5823849d443bd70831ddff6252631fb333d4aff..8701b8b7df3ade43bbf9f6cd195142915226d110 100644
--- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h
@@ -39,17 +39,27 @@ static constexpr int Reduction_registersPerThread = 32;   // empirically determi
    static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
 #endif
 
-template< int blockSize, typename Operation, typename Index >
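+////
+// The kernel is parametrized by a data fetcher returning the i-th value to be
+// reduced, the reduction operation, its volatile variant needed for the
+// warp-synchronous part of the reduction, and the identity element 'zero'.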
+template< int blockSize,
+   typename Result,
+   typename DataFetcher,
+   typename Reduction,
+   typename VolatileReduction,
+   typename Index >
 __global__ void
 __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
-CudaReductionKernel( Operation operation,
+CudaReductionKernel( const Result zero,
+                     const DataFetcher dataFetcher,
+                     const Reduction reduction,
+                     const VolatileReduction volatileReduction,
                      const Index size,
-                     const typename Operation::DataType1* input1,
-                     const typename Operation::DataType2* input2,
-                     typename Operation::ResultType* output )
+                     Result* output )
 {
-   typedef Index IndexType;
-   typedef typename Operation::ResultType ResultType;
+   using IndexType = Index;
+   using ResultType = Result;
 
    ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >();
 
@@ -62,55 +72,53 @@ CudaReductionKernel( Operation operation,
          IndexType gid = blockIdx.x * blockDim. x + threadIdx.x;
    const IndexType gridSize = blockDim.x * gridDim.x;
 
-   sdata[ tid ] = operation.initialValue();
+   sdata[ tid ] = zero;
+
    /***
     * Read data into the shared memory. We start with the
     * sequential reduction.
     */
    while( gid + 4 * gridSize < size )
    {
-      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
-      operation.firstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
-      operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 );
-      operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 );
+      reduction( sdata[ tid ], dataFetcher( gid ) );
+      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
+      reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) );
+      reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) );
       gid += 4 * gridSize;
    }
    while( gid + 2 * gridSize < size )
    {
-      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
-      operation.firstReduction( sdata[ tid ], gid + gridSize,     input1, input2 );
+      reduction( sdata[ tid ], dataFetcher( gid ) );
+      reduction( sdata[ tid ], dataFetcher( gid + gridSize ) );
       gid += 2 * gridSize;
    }
    while( gid < size )
    {
-      operation.firstReduction( sdata[ tid ], gid,                input1, input2 );
+      reduction( sdata[ tid ], dataFetcher( gid ) );
       gid += gridSize;
    }
    __syncthreads();
 
-
    //printf( "1: tid %d data %f \n", tid, sdata[ tid ] );
-
-   //return;
    /***
     *  Perform the parallel reduction.
     */
    if( blockSize >= 1024 )
    {
       if( tid < 512 )
-         operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] );
+         reduction( sdata[ tid ], sdata[ tid + 512 ] );
       __syncthreads();
    }
    if( blockSize >= 512 )
    {
       if( tid < 256 )
-         operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] );
+         reduction( sdata[ tid ], sdata[ tid + 256 ] );
       __syncthreads();
    }
    if( blockSize >= 256 )
    {
       if( tid < 128 )
-         operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] );
+         reduction( sdata[ tid ], sdata[ tid + 128 ] );
       __syncthreads();
       //printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
    }
@@ -118,48 +126,47 @@ CudaReductionKernel( Operation operation,
    if( blockSize >= 128 )
    {
       if( tid <  64 )
-         operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] );
+         reduction( sdata[ tid ], sdata[ tid + 64 ] );
       __syncthreads();
       //printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
    }
 
-
    /***
     * This runs in one warp so it is synchronized implicitly.
-    */   
+    */
    if( tid < 32 )
    {
       volatile ResultType* vsdata = sdata;
       if( blockSize >= 64 )
       {
-         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] );
          //printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
       }
       // TODO: If blocksize == 32, the following does not work
       // We do not check if tid < 16. Fix it!!!
       if( blockSize >= 32 )
       {
-         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] );
          //printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >= 16 )
       {
-         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] );
          //printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >=  8 )
       {
-         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] );
          //printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >=  4 )
       {
-         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] );
          //printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
       }
       if( blockSize >=  2 )
       {
-         operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
+         volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] );
          //printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
       }
    }
@@ -175,17 +182,18 @@ CudaReductionKernel( Operation operation,
 
 }
 
-template< typename Operation, typename Index >
-int
-CudaReductionKernelLauncher( Operation& operation,
-                             const Index size,
-                             const typename Operation::DataType1* input1,
-                             const typename Operation::DataType2* input2,
-                             typename Operation::ResultType*& output )
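+////
+// The launcher splits the reduction into two phases: 'start' reduces the input
+// to one partial result per CUDA block and 'finish' re-launches the kernel on
+// the partial results until a single value remains.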
+template< typename Index,
+          typename Result >
+struct CudaReductionKernelLauncher
 {
-   typedef Index IndexType;
-   typedef typename Operation::ResultType ResultType;
+   using IndexType = Index;
+   using ResultType = Result;
 
+   ////
    // The number of blocks should be a multiple of the number of multiprocessors
    // to ensure optimum balancing of the load. This is very important, because
    // we run the kernel with a fixed number of blocks, so the amount of work per
@@ -195,93 +203,162 @@ CudaReductionKernelLauncher( Operation& operation,
    // where blocksPerMultiprocessor is determined according to the number of
    // available registers on the multiprocessor.
    // On Tesla K40c, desGridSize = 8 * 15 = 120.
-   const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
-   const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
-                                      / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
-   const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
-   dim3 blockSize, gridSize;
-   blockSize.x = Reduction_maxThreadsPerBlock;
-   gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
-
-   // create reference to the reduction buffer singleton and set size
-   const size_t buf_size = desGridSize * sizeof( ResultType );
-   CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
-   cudaReductionBuffer.setSize( buf_size );
-   output = cudaReductionBuffer.template getData< ResultType >();
-
-   // when there is only one warp per blockSize.x, we need to allocate two warps
-   // worth of shared memory so that we don't index shared memory out of bounds
-   const IndexType shmem = (blockSize.x <= 32)
-            ? 2 * blockSize.x * sizeof( ResultType )
-            : blockSize.x * sizeof( ResultType );
+   CudaReductionKernelLauncher( const Index size )
+   : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
+     blocksPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+                              / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
+     desGridSize( blocksPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+     originalSize( size )
+   {
+   }
 
-   /***
-    * Depending on the blockSize we generate appropriate template instance.
-    */
-   switch( blockSize.x )
+   template< typename DataFetcher,
+             typename Reduction,
+             typename VolatileReduction >
+   int start( const Reduction& reduction,
+              const VolatileReduction& volatileReduction,
+              const DataFetcher& dataFetcher,
+              const Result& zero,
+              ResultType*& output )
    {
-      case 512:
-         CudaReductionKernel< 512 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case 256:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 256 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case 128:
-         cudaFuncSetCacheConfig(CudaReductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel< 128 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case  64:
-         cudaFuncSetCacheConfig(CudaReductionKernel<  64, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel<  64 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case  32:
-         cudaFuncSetCacheConfig(CudaReductionKernel<  32, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel<  32 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case  16:
-         cudaFuncSetCacheConfig(CudaReductionKernel<  16, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel<  16 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-     case   8:
-         cudaFuncSetCacheConfig(CudaReductionKernel<   8, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel<   8 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case   4:
-         cudaFuncSetCacheConfig(CudaReductionKernel<   4, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel<   4 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case   2:
-         cudaFuncSetCacheConfig(CudaReductionKernel<   2, Operation, Index >, cudaFuncCachePreferShared);
-
-         CudaReductionKernel<   2 >
-         <<< gridSize, blockSize, shmem >>>( operation, size, input1, input2, output);
-         break;
-      case   1:
-         TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
-      default:
-         TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+      ////
+      // Create a reference to the reduction buffer singleton and set its size to
+      // two grid-sized halves, between which 'finish' can ping-pong.
+      const size_t buf_size = 2 * desGridSize * sizeof( ResultType );
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      cudaReductionBuffer.setSize( buf_size );
+      output = cudaReductionBuffer.template getData< ResultType >();
+
+      this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output );
+      return this->reducedSize;
    }
-   TNL_CHECK_CUDA_DEVICE;
 
-   // return the size of the output array on the CUDA device
-   return gridSize.x;
-}
+   template< typename Reduction,
+             typename VolatileReduction >
+   Result finish( const Reduction& reduction,
+                  const VolatileReduction& volatileReduction,
+                  const Result& zero )
+   {
+      ////
+      // Input is the first half of the buffer, output is the second half
+      CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance();
+      ResultType* input = cudaReductionBuffer.template getData< ResultType >();
+      ResultType* output = &input[ desGridSize ];
+
+      while( this->reducedSize > 1 )
+      {
+         ////
+         // The fetcher has to be redefined in each iteration, because the captured
+         // 'input' pointer changes in the std::swap below.
+         auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return input[ i ]; };
+         this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output );
+         std::swap( input, output );
+      }
+
+      ////
+      // After the last swap, the result is in 'input' - copy it to the host
+      ResultType result;
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( &result, input, 1 );
+      return result;
+   }
+
+   protected:
+      template< typename DataFetcher,
+                typename Reduction,
+                typename VolatileReduction >
+      int launch( const Index size,
+                  const Reduction& reduction,
+                  const VolatileReduction& volatileReduction,
+                  const DataFetcher& dataFetcher,
+                  const Result& zero,
+                  Result* output )
+      {
+         dim3 blockSize, gridSize;
+         blockSize.x = Reduction_maxThreadsPerBlock;
+         gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+
+         ////
+         // when there is only one warp per blockSize.x, we need to allocate two warps
+         // worth of shared memory so that we don't index shared memory out of bounds
+         const IndexType shmem = (blockSize.x <= 32)
+                  ? 2 * blockSize.x * sizeof( ResultType )
+                  : blockSize.x * sizeof( ResultType );
+
+        /***
+         * Depending on the blockSize we generate appropriate template instance.
+         */
+        switch( blockSize.x )
+        {
+           case 512:
+              CudaReductionKernel< 512 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case 256:
+              cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel< 256 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case 128:
+              cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel< 128 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case  64:
+              cudaFuncSetCacheConfig(CudaReductionKernel<  64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<  64 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case  32:
+              cudaFuncSetCacheConfig(CudaReductionKernel<  32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<  32 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case  16:
+              cudaFuncSetCacheConfig(CudaReductionKernel<  16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<  16 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+          case   8:
+              cudaFuncSetCacheConfig(CudaReductionKernel<   8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<   8 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case   4:
+              cudaFuncSetCacheConfig(CudaReductionKernel<   4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<   4 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case   2:
+              cudaFuncSetCacheConfig(CudaReductionKernel<   2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared);
+
+              CudaReductionKernel<   2 >
+              <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output);
+              break;
+           case   1:
+              TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl );
+           default:
+              TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
+        }
+        TNL_CHECK_CUDA_DEVICE;
+
+        ////
+        // return the size of the output array on the CUDA device
+        return gridSize.x;
+      }
+
+      const int activeDevice;
+      const int blocksPerMultiprocessor;
+      const int desGridSize;
+      const IndexType originalSize;
+      IndexType reducedSize;
+};
 #endif
 
 } // namespace Algorithms
diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h
index d4f45f30e435590629475d107254822485b33979..7ac0d4c028a5365430b67707d63ca185cca4a275 100644
--- a/src/TNL/Containers/Algorithms/Reduction.h
+++ b/src/TNL/Containers/Algorithms/Reduction.h
@@ -10,7 +10,7 @@
 
 // Implemented by: Tomas Oberhuber, Jakub Klinkovsky
 
-#pragma once 
+#pragma once
 
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
@@ -18,7 +18,7 @@
 
 namespace TNL {
 namespace Containers {
-namespace Algorithms {   
+namespace Algorithms {
 
 template< typename Device >
 class Reduction
@@ -29,40 +29,60 @@ template<>
 class Reduction< Devices::Cuda >
 {
 public:
-   template< typename Operation, typename Index >
-   static typename Operation::ResultType
-   reduce( Operation& operation,
-           const Index size,
-           const typename Operation::DataType1* deviceInput1,
-           const typename Operation::DataType2* deviceInput2 );
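+   ////
+   // 'dataFetcher( i )' returns the i-th value to be reduced, 'reduction' combines
+   // two partial results, 'volatileReduction' is its volatile variant needed for
+   // warp-synchronous reduction on CUDA, and 'zero' is the identity element of
+   // the reduction operation.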
+   template< typename Index,
+             typename Result,
+             typename ReductionOperation,
+             typename VolatileReductionOperation,
+             typename DataFetcher >
+   static Result
+   reduce( const Index size,
+           ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
+           DataFetcher& dataFetcher,
+           const Result& zero );
 };
 
 template<>
 class Reduction< Devices::Host >
 {
 public:
-   template< typename Operation, typename Index >
-   static typename Operation::ResultType
-   reduce( Operation& operation,
-           const Index size,
-           const typename Operation::DataType1* deviceInput1,
-           const typename Operation::DataType2* deviceInput2 );
+   template< typename Index,
+             typename Result,
+             typename ReductionOperation,
+             typename VolatileReductionOperation,
+             typename DataFetcher >
+   static Result
+   reduce( const Index size,
+           ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
+           DataFetcher& dataFetcher,
+           const Result& zero );
 };
 
 template<>
 class Reduction< Devices::MIC >
 {
 public:
-   template< typename Operation, typename Index >
-   static typename Operation::ResultType
-   reduce( Operation& operation,
-           const Index size,
-           const typename Operation::DataType1* deviceInput1,
-           const typename Operation::DataType2* deviceInput2 );
+   template< typename Index,
+             typename Result,
+             typename ReductionOperation,
+             typename VolatileReductionOperation,
+             typename DataFetcher >
+   static Result
+   reduce( const Index size,
+           ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
+           DataFetcher& dataFetcher,
+           const Result& zero );
 };
 
 } // namespace Algorithms
 } // namespace Containers
 } // namespace TNL
 
-#include "Reduction_impl.h"
+#include "Reduction.hpp"
diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction.hpp
similarity index 57%
rename from src/TNL/Containers/Algorithms/Reduction_impl.h
rename to src/TNL/Containers/Algorithms/Reduction.hpp
index b6a8562497734c7a68222300f5ebed64a7be6941..31c93504cc18f3616171b855a286345953ca19d0 100644
--- a/src/TNL/Containers/Algorithms/Reduction_impl.h
+++ b/src/TNL/Containers/Algorithms/Reduction.hpp
@@ -38,35 +38,39 @@ namespace Algorithms {
  */
 static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;
 
-template< typename Operation, typename Index >
-typename Operation::ResultType
+
+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename VolatileReductionOperation,
+          typename DataFetcher >
+Result
 Reduction< Devices::Cuda >::
-reduce( Operation& operation,
-        const Index size,
-        const typename Operation::DataType1* deviceInput1,
-        const typename Operation::DataType2* deviceInput2 )
+   reduce( const Index size,
+           ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
+           DataFetcher& dataFetcher,
+           const Result& zero )
 {
 #ifdef HAVE_CUDA
 
-   typedef Index IndexType;
-   typedef typename Operation::DataType1 DataType1;
-   typedef typename Operation::DataType2 DataType2;
-   typedef typename Operation::ResultType ResultType;
-   typedef typename Operation::LaterReductionOperation LaterReductionOperation;
+   using IndexType = Index;
+   using ResultType = Result;
 
    /***
     * Only fundamental and pointer types can be safely reduced on host. Complex
     * objects stored on the device might contain pointers into the device memory,
     * in which case reduction on host might fail.
     */
-   constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value;
+   //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value;
    constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value;
 
    /***
     * First check if the input array(s) is/are large enough for the reduction on GPU.
     * Otherwise copy it/them to host and reduce on CPU.
     */
-   if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
+   // With the lambda-based interface, the data cannot be reduced on the host: it is accessible only through the device-side fetcher.
+   /*if( can_reduce_all_on_host && size <= Reduction_minGpuDataSize )
    {
       typename std::remove_const< DataType1 >::type hostArray1[ Reduction_minGpuDataSize ];
       ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray1, deviceInput1, size );
@@ -74,12 +78,12 @@ reduce( Operation& operation,
          using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type;
          typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ];
          ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size );
-         return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 );
+         return Reduction< Devices::Host >::reduce( zero, dataFetcher, reduction, size, hostArray1, hostArray2 );
       }
       else {
          return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr );
       }
-   }
+   }*/
 
    #ifdef CUDA_REDUCTION_PROFILING
       Timer timer;
@@ -87,15 +91,18 @@ reduce( Operation& operation,
       timer.start();
    #endif
 
+   CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size );
+
    /****
     * Reduce the data on the CUDA device.
     */
    ResultType* deviceAux1( 0 );
-   IndexType reducedSize = CudaReductionKernelLauncher( operation,
-                                                        size,
-                                                        deviceInput1,
-                                                        deviceInput2,
-                                                        deviceAux1 );
+   IndexType reducedSize = reductionLauncher.start( 
+      reduction,
+      volatileReduction,
+      dataFetcher,
+      zero,
+      deviceAux1 );
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
       std::cout << "   Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl;
@@ -107,8 +114,8 @@ reduce( Operation& operation,
       /***
        * Transfer the reduced data from device to host.
        */
-      ResultType resultArray[ reducedSize ];
-      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
+      std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] };
+      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -120,28 +127,20 @@ reduce( Operation& operation,
       /***
        * Reduce the data on the host system.
        */
-      LaterReductionOperation laterReductionOperation;
-      const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr );
+      auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; };
+      const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
          std::cout << "   Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl;
       #endif
-
       return result;
    }
    else {
       /***
        * Data can't be safely reduced on host, so continue with the reduction on the CUDA device.
        */
-      LaterReductionOperation laterReductionOperation;
-      while( reducedSize > 1 ) {
-         reducedSize = CudaReductionKernelLauncher( laterReductionOperation,
-                                                    reducedSize,
-                                                    deviceAux1,
-                                                    (ResultType*) 0,
-                                                    deviceAux1 );
-      }
+      auto result = reductionLauncher.finish( reduction, volatileReduction, zero );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -150,14 +149,14 @@ reduce( Operation& operation,
          timer.start();
       #endif
 
-      ResultType resultArray[ 1 ];
-      ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
-      const ResultType result = resultArray[ 0 ];
+      //ResultType resultArray[ 1 ];
+      //ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize );
+      //const ResultType result = resultArray[ 0 ];
 
-      #ifdef CUDA_REDUCTION_PROFILING
+      /*#ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
          std::cout << "   Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl;
-      #endif
+      #endif*/
 
       return result;
    }
@@ -166,18 +165,21 @@ reduce( Operation& operation,
 #endif
 };
 
-template< typename Operation, typename Index >
-typename Operation::ResultType
+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename VolatileReductionOperation,
+          typename DataFetcher >
+Result
 Reduction< Devices::Host >::
-reduce( Operation& operation,
-        const Index size,
-        const typename Operation::DataType1* input1,
-        const typename Operation::DataType2* input2 )
+   reduce( const Index size,
+           ReductionOperation& reduction,
+           VolatileReductionOperation& volatileReduction,
+           DataFetcher& dataFetcher,
+           const Result& zero )
 {
-   typedef Index IndexType;
-   typedef typename Operation::DataType1 DataType1;
-   typedef typename Operation::DataType2 DataType2;
-   typedef typename Operation::ResultType ResultType;
+   using IndexType = Index;
+   using ResultType = Result;
 
    constexpr int block_size = 128;
    const int blocks = size / block_size;
@@ -185,23 +187,20 @@ reduce( Operation& operation,
 #ifdef HAVE_OPENMP
    if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) {
       // global result variable
-      ResultType result = operation.initialValue();
+      ResultType result = zero;
 #pragma omp parallel
       {
          // initialize array for thread-local results
-         ResultType r[ 4 ] = { operation.initialValue(),
-                               operation.initialValue(),
-                               operation.initialValue(),
-                               operation.initialValue() };
+         ResultType r[ 4 ] = { zero, zero, zero, zero  };
 
          #pragma omp for nowait
          for( int b = 0; b < blocks; b++ ) {
             const IndexType offset = b * block_size;
             for( int i = 0; i < block_size; i += 4 ) {
-               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
-               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
-               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
-               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
+               reduction( r[ 0 ], dataFetcher( offset + i ) );
+               reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
             }
          }
 
@@ -209,18 +208,18 @@ reduce( Operation& operation,
          #pragma omp single nowait
          {
             for( IndexType i = blocks * block_size; i < size; i++ )
-               operation.firstReduction( r[ 0 ], i, input1, input2 );
+               reduction( r[ 0 ], dataFetcher( i ) );
          }
 
          // local reduction of unrolled results
-         operation.commonReduction( r[ 0 ], r[ 2 ] );
-         operation.commonReduction( r[ 1 ], r[ 3 ] );
-         operation.commonReduction( r[ 0 ], r[ 1 ] );
+         reduction( r[ 0 ], r[ 2 ] );
+         reduction( r[ 1 ], r[ 3 ] );
+         reduction( r[ 0 ], r[ 1 ] );
 
          // inter-thread reduction of local results
          #pragma omp critical
          {
-            operation.commonReduction( result, r[ 0 ] );
+            reduction( result, r[ 0 ] );
          }
       }
       return result;
@@ -229,37 +228,33 @@ reduce( Operation& operation,
 #endif
       if( blocks > 1 ) {
          // initialize array for unrolled results
-         ResultType r[ 4 ] = { operation.initialValue(),
-                               operation.initialValue(),
-                               operation.initialValue(),
-                               operation.initialValue() };
+         ResultType r[ 4 ] = { zero, zero, zero, zero };
 
          // main reduction (explicitly unrolled loop)
          for( int b = 0; b < blocks; b++ ) {
             const IndexType offset = b * block_size;
             for( int i = 0; i < block_size; i += 4 ) {
-               operation.firstReduction( r[ 0 ], offset + i,     input1, input2 );
-               operation.firstReduction( r[ 1 ], offset + i + 1, input1, input2 );
-               operation.firstReduction( r[ 2 ], offset + i + 2, input1, input2 );
-               operation.firstReduction( r[ 3 ], offset + i + 3, input1, input2 );
+               reduction( r[ 0 ], dataFetcher( offset + i ) );
+               reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
+               reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
+               reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
             }
          }
 
          // reduction of the last, incomplete block (not unrolled)
          for( IndexType i = blocks * block_size; i < size; i++ )
-            operation.firstReduction( r[ 0 ], i, input1, input2 );
+            reduction( r[ 0 ], dataFetcher( i ) );
 
          // reduction of unrolled results
-         operation.commonReduction( r[ 0 ], r[ 2 ] );
-         operation.commonReduction( r[ 1 ], r[ 3 ] );
-         operation.commonReduction( r[ 0 ], r[ 1 ] );
-
+         reduction( r[ 0 ], r[ 2 ] );
+         reduction( r[ 1 ], r[ 3 ] );
+         reduction( r[ 0 ], r[ 1 ] );
          return r[ 0 ];
       }
       else {
-         ResultType result = operation.initialValue();
+         ResultType result = zero;
          for( IndexType i = 0; i < size; i++ )
-            operation.firstReduction( result, i, input1, input2 );
+            reduction( result, dataFetcher( i ) );
          return result;
       }
 #ifdef HAVE_OPENMP
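
With the Operation objects gone, a call site now supplies the data fetcher, the two
reduction functors and the identity element directly as callables. A minimal sketch of
a host-side sum, assuming a raw array a of n doubles (both names are illustrative only):

   // Illustrative sketch, not part of this patch.
   auto fetch = [=] ( int i ) -> double { return a[ i ]; };
   auto reduction = [] ( double& x, const double& y ) { x += y; };
   auto volatileReduction = [] ( volatile double& x, volatile double& y ) { x += y; };
   const double result = Reduction< Devices::Host >::reduce( n, reduction, volatileReduction, fetch, 0.0 );

The last argument is the identity element of the reduction (zero in the signature
above); it replaces the old operation.initialValue().
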
diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h
index a3ef89fc3213c81db53a6f8a0f867379734b588e..1c89dbca036ba7fecba775622890b8cf833b3f51 100644
--- a/src/TNL/Containers/Algorithms/VectorOperations.h
+++ b/src/TNL/Containers/Algorithms/VectorOperations.h
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <TNL/Containers/Algorithms/Reduction.h>
+#include <TNL/Containers/Algorithms/CommonVectorOperations.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 
@@ -22,7 +23,7 @@ template< typename Device >
 class VectorOperations{};
 
 template<>
-class VectorOperations< Devices::Host >
+class VectorOperations< Devices::Host > : public CommonVectorOperations< Devices::Host >
 {
 public:
    template< typename Vector >
@@ -36,7 +37,7 @@ public:
                            const typename Vector::RealType& value,
                            const Scalar thisElementMultiplicator );
 
-   template< typename Vector, typename ResultType = typename Vector::RealType >
+   /*template< typename Vector, typename ResultType = typename Vector::RealType >
    static ResultType getVectorMax( const Vector& v );
 
    template< typename Vector, typename ResultType = typename Vector::RealType >
@@ -83,12 +84,13 @@ public:
 
    template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
    static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 );
+   */
 
    template< typename Vector, typename Scalar >
    static void vectorScalarMultiplication( Vector& v, Scalar alpha );
 
-   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
-   static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );
+   /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/
 
    template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
    static void addVector( Vector1& y,
@@ -116,7 +118,7 @@ public:
 };
 
 template<>
-class VectorOperations< Devices::Cuda >
+class VectorOperations< Devices::Cuda > : public CommonVectorOperations< Devices::Cuda >
 {
 public:
    template< typename Vector >
@@ -130,7 +132,7 @@ public:
                            const typename Vector::RealType& value,
                            const Scalar thisElementMultiplicator );
 
-   template< typename Vector, typename ResultType = typename Vector::RealType >
+   /*template< typename Vector, typename ResultType = typename Vector::RealType >
    static ResultType getVectorMax( const Vector& v );
 
    template< typename Vector, typename ResultType = typename Vector::RealType >
@@ -177,12 +179,13 @@ public:
 
    template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
    static ResultType getVectorDifferenceSum( const Vector1& v1, const Vector2& v2 );
+   */
 
    template< typename Vector, typename Scalar >
    static void vectorScalarMultiplication( Vector& v, const Scalar alpha );
 
-   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
-   static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );
+   /*template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
+   static ResultType getScalarProduct( const Vector1& v1, const Vector2& v2 );*/
 
    template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
    static void addVector( Vector1& y,
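
The declarations commented out above are expected to come from the new
CommonVectorOperations base class, whose contents are not part of this patch. A rough
sketch of how such a device-generic member might be built on top of the lambda-based
reduce (the member signature and the use of std::numeric_limits are assumptions here;
note that the identity element of a maximum is the lowest representable value, not zero):

   // Illustrative sketch, not the actual CommonVectorOperations code.
   template< typename Device >
   struct CommonVectorOperations
   {
      template< typename Vector, typename ResultType = typename Vector::RealType >
      static ResultType getVectorMax( const Vector& v )
      {
         using IndexType = typename Vector::IndexType;
         const auto* data = v.getData();
         auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
         auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { if( b > a ) a = b; };
         auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { if( b > a ) a = b; };
         return Reduction< Device >::reduce( v.getSize(), reduction, volatileReduction, fetch,
                                             std::numeric_limits< ResultType >::lowest() );
      }
   };
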
diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
index 05f334b051dc5ef3f7471bd3a1cef32aebe607e0..d79b5f06909d08cb918336ad3225c0fa206443d7 100644
--- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h
@@ -39,7 +39,7 @@ addElement( Vector& v,
    v[ i ] = thisElementMultiplicator * v[ i ] + value;
 }
 
-template< typename Vector, typename ResultType >
+/*template< typename Vector, typename ResultType >
 ResultType
 VectorOperations< Devices::Cuda >::
 getVectorMax( const Vector& v )
@@ -314,7 +314,7 @@ getVectorDifferenceSum( const Vector1& v1,
                                               v1.getSize(),
                                               v1.getData(),
                                               v2.getData() );
-}
+}*/
 
 #ifdef HAVE_CUDA
 template< typename Real, typename Index, typename Scalar >
@@ -358,7 +358,7 @@ vectorScalarMultiplication( Vector& v,
 }
 
 
-template< typename Vector1, typename Vector2, typename ResultType >
+/*template< typename Vector1, typename Vector2, typename ResultType >
 ResultType
 VectorOperations< Devices::Cuda >::
 getScalarProduct( const Vector1& v1,
@@ -372,7 +372,7 @@ getScalarProduct( const Vector1& v1,
                                               v1.getSize(),
                                               v1.getData(),
                                               v2.getData() );
-}
+}*/
 
 #ifdef HAVE_CUDA
 template< typename Real1, typename Real2, typename Index, typename Scalar1, typename Scalar2 >
diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
index 67400da64305767c86a85f600759f7f6ddafa691..7b1bba80c4cd619275bda20d537c5646ec154a0b 100644
--- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
+++ b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h
@@ -41,283 +41,6 @@ addElement( Vector& v,
    v[ i ] = thisElementMultiplicator * v[ i ] + value;
 }
 
-template< typename Vector, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorMax( const Vector& v )
-{
-   typedef typename Vector::RealType RealType;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-
-   Algorithms::ParallelReductionMax< RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v.getSize(),
-                                              v.getData(),
-                                              ( RealType* ) 0 );
-}
-
-template< typename Vector, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorMin( const Vector& v )
-{
-   typedef typename Vector::RealType RealType;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-
-   Algorithms::ParallelReductionMin< RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v.getSize(),
-                                              v.getData(),
-                                              ( RealType* ) 0 );
-}
-
-template< typename Vector, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorAbsMax( const Vector& v )
-{
-   typedef typename Vector::RealType RealType;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-
-   Algorithms::ParallelReductionAbsMax< RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v.getSize(),
-                                              v.getData(),
-                                              ( RealType* ) 0 );
-}
-
-
-template< typename Vector, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorAbsMin( const Vector& v )
-{
-   typedef typename Vector::RealType RealType;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-
-   Algorithms::ParallelReductionAbsMin< RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v.getSize(),
-                                              v.getData(),
-                                              ( RealType* ) 0 );
-}
-
-template< typename Vector, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorL1Norm( const Vector& v )
-{
-   typedef typename Vector::RealType RealType;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-
-   Algorithms::ParallelReductionAbsSum< RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v.getSize(),
-                                              v.getData(),
-                                              ( RealType* ) 0 );
-}
-
-template< typename Vector, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorL2Norm( const Vector& v )
-{
-   typedef typename Vector::RealType Real;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-
-   Algorithms::ParallelReductionL2Norm< Real, ResultType > operation;
-   const ResultType result = Reduction< Devices::Host >::reduce( operation,
-                                                                 v.getSize(),
-                                                                 v.getData(),
-                                                                 ( Real* ) 0 );
-   return std::sqrt( result );
-}
-
-template< typename Vector, typename ResultType, typename Scalar >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorLpNorm( const Vector& v,
-                 const Scalar p )
-{
-   typedef typename Vector::RealType Real;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
-
-   if( p == 1.0 )
-      return getVectorL1Norm< Vector, ResultType >( v );
-   if( p == 2.0 )
-      return getVectorL2Norm< Vector, ResultType >( v );
-
-   Algorithms::ParallelReductionLpNorm< Real, ResultType, Scalar > operation;
-   operation.setPower( p );
-   const ResultType result = Reduction< Devices::Host >::reduce( operation,
-                                                                 v.getSize(),
-                                                                 v.getData(),
-                                                                 ( Real* ) 0 );
-   return std::pow( result, 1.0 / p );
-}
-
-template< typename Vector, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorSum( const Vector& v )
-{
-   typedef typename Vector::RealType Real;
-
-   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
-
-   Algorithms::ParallelReductionSum< Real, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v.getSize(),
-                                              v.getData(),
-                                              ( Real* ) 0 );
-}
-
-template< typename Vector1, typename Vector2, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceMax( const Vector1& v1,
-                        const Vector2& v2 )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-
-   Algorithms::ParallelReductionDiffMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v1.getSize(),
-                                              v1.getData(),
-                                              v2.getData() );
-}
-
-template< typename Vector1, typename Vector2, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceMin( const Vector1& v1,
-                        const Vector2& v2 )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-
-   Algorithms::ParallelReductionDiffMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v1.getSize(),
-                                              v1.getData(),
-                                              v2.getData() );
-}
-
-template< typename Vector1, typename Vector2, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceAbsMax( const Vector1& v1,
-                           const Vector2& v2 )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-
-   Algorithms::ParallelReductionDiffAbsMax< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v1.getSize(),
-                                              v1.getData(),
-                                              v2.getData() );
-}
-
-template< typename Vector1, typename Vector2, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceAbsMin( const Vector1& v1,
-                           const Vector2& v2 )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-
-   Algorithms::ParallelReductionDiffAbsMin< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v1.getSize(),
-                                              v1.getData(),
-                                              v2.getData() );
-}
-
-template< typename Vector1, typename Vector2, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceL1Norm( const Vector1& v1,
-                           const Vector2& v2 )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-
-   Algorithms::ParallelReductionDiffAbsSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v1.getSize(),
-                                              v1.getData(),
-                                              v2.getData() );
-}
-
-template< typename Vector1, typename Vector2, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceL2Norm( const Vector1& v1,
-                           const Vector2& v2 )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-
-   Algorithms::ParallelReductionDiffL2Norm< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   const ResultType result = Reduction< Devices::Host >::reduce( operation,
-                                                                 v1.getSize(),
-                                                                 v1.getData(),
-                                                                 v2.getData() );
-   return std::sqrt( result );
-}
-
-
-template< typename Vector1, typename Vector2, typename ResultType, typename Scalar >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceLpNorm( const Vector1& v1,
-                           const Vector2& v2,
-                           const Scalar p )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-   TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." );
-
-   if( p == 1.0 )
-      return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 );
-   if( p == 2.0 )
-      return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 );
-
-   Algorithms::ParallelReductionDiffLpNorm< typename Vector1::RealType, typename Vector2::RealType, ResultType, Scalar > operation;
-   operation.setPower( p );
-   const ResultType result = Reduction< Devices::Host >::reduce( operation,
-                                                                 v1.getSize(),
-                                                                 v1.getData(),
-                                                                 v2.getData() );
-   return std::pow( result, 1.0 / p );
-}
-
-template< typename Vector1, typename Vector2, typename ResultType >
-ResultType
-VectorOperations< Devices::Host >::
-getVectorDifferenceSum( const Vector1& v1,
-                        const Vector2& v2 )
-{
-   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
-   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
-
-   Algorithms::ParallelReductionDiffSum< typename Vector1::RealType, typename Vector2::RealType, ResultType > operation;
-   return Reduction< Devices::Host >::reduce( operation,
-                                              v1.getSize(),
-                                              v1.getData(),
-                                              v2.getData() );
-}
 
 
 template< typename Vector, typename Scalar >
@@ -339,7 +62,7 @@ vectorScalarMultiplication( Vector& v,
 }
 
 
-template< typename Vector1, typename Vector2, typename ResultType >
+/*template< typename Vector1, typename Vector2, typename ResultType >
 ResultType
 VectorOperations< Devices::Host >::
 getScalarProduct( const Vector1& v1,
@@ -353,7 +76,7 @@ getScalarProduct( const Vector1& v1,
                                               v1.getSize(),
                                               v1.getData(),
                                               v2.getData() );
-}
+}*/
 
 template< typename Vector1, typename Vector2, typename Scalar1, typename Scalar2 >
 void
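
The commented-out host specialization of getScalarProduct has a natural counterpart in
the new interface, which fuses the element-wise product into the fetcher. A hypothetical
free-function sketch (the name scalarProduct and the surrounding declarations are
illustrative only):

   template< typename Vector1, typename Vector2, typename ResultType = typename Vector1::RealType >
   ResultType scalarProduct( const Vector1& v1, const Vector2& v2 )
   {
      TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
      const auto* a = v1.getData();
      const auto* b = v2.getData();
      auto fetch = [=] ( typename Vector1::IndexType i ) -> ResultType { return a[ i ] * b[ i ]; };
      auto reduction = [] ( ResultType& x, const ResultType& y ) { x += y; };
      auto volatileReduction = [] ( volatile ResultType& x, volatile ResultType& y ) { x += y; };
      return Reduction< Devices::Host >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ResultType( 0 ) );
   }
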
diff --git a/src/TNL/Math.h b/src/TNL/Math.h
index 68eb1f556f246c2a3e0909e0f58ebd2ee57b17dd..dcfa91a3450d2ef33b0475edb77f6801dfea96e0 100644
--- a/src/TNL/Math.h
+++ b/src/TNL/Math.h
@@ -8,7 +8,7 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
-#pragma once 
+#pragma once
 
 #include <cmath>
 #include <type_traits>
@@ -18,6 +18,16 @@
 
 namespace TNL {
 
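+/**
+ * \brief This function returns sum of two numbers.
+ */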
+template< typename T1, typename T2, typename ResultType = typename std::common_type< T1, T2 >::type >
+__cuda_callable__ inline
+ResultType sum( const T1& a, const T2& b )
+{
+   return a + b;
+}
+
 /**
  * \brief This function returns minimum of two numbers.
  *
@@ -105,7 +115,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t
 __cuda_callable__
 ResultType argMax( const T1& a, const T2& b )
 {
-   return ( a > b ) ?  a : b;   
+   return ( a > b ) ? a : b;
 }
 
 /***
@@ -125,7 +135,7 @@ template< typename T1, typename T2, typename ResultType = typename std::common_t
 __cuda_callable__
 ResultType argAbsMax( const T1& a, const T2& b )
 {
-   return ( TNL::abs( a ) > TNL::abs( b ) ) ?  a : b;   
+   return ( TNL::abs( a ) > TNL::abs( b ) ) ? a : b;
 }
 
 /**