From 7ad99cc2d67ab0e8315502396d8a5cff2dd0225d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Fri, 2 Jul 2021 17:40:36 +0200
Subject: [PATCH 01/34] Added Functional.h - replacemant of STL functional for
 parallel reduction.

---
 src/TNL/Algorithms/Reduction.h             |  72 +++++++++--
 src/TNL/Functional.h                       | 134 +++++++++++++++++++++
 src/UnitTests/Algorithms/CMakeLists.txt    |   1 +
 src/UnitTests/Algorithms/ReductionTest.cpp |   1 +
 src/UnitTests/Algorithms/ReductionTest.cu  |   1 +
 src/UnitTests/Algorithms/ReductionTest.h   |  45 +++++++
 src/UnitTests/CMakeLists.txt               |   9 +-
 7 files changed, 255 insertions(+), 8 deletions(-)
 create mode 100644 src/TNL/Functional.h
 create mode 100644 src/UnitTests/Algorithms/ReductionTest.cpp
 create mode 100644 src/UnitTests/Algorithms/ReductionTest.cu
 create mode 100644 src/UnitTests/Algorithms/ReductionTest.h
diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index d928ec687..f394ce86c 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -13,8 +13,9 @@
 #pragma once
 
 #include <utility>  // std::pair
-#include <functional>  // reduction functions like std::plus, std::logical_and, std::logical_or etc.
+#include <functional>        // reduction functions like std::plus, std::logical_and, std::logical_or etc. - deprecated
 
+#include <TNL/Functional.h>  // replacement of STL functional
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
@@ -41,7 +42,7 @@ template<>
 struct Reduction< Devices::Sequential >
 {
    /**
-    * \brief Computes reduction on CPU sequentialy.
+    * \brief Computes reduction on CPU sequentially.
     *
     * \tparam Index is a type for indexing.
     * \tparam Result is a type of the reduction result.
@@ -85,7 +86,7 @@ struct Reduction< Devices::Sequential >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& zero = Reduce::template getIdempotent< Result >() );
 
    /**
     * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
@@ -137,7 +138,7 @@ struct Reduction< Devices::Sequential >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& zero = Reduce::idempotent );
 };
 
 template<>
@@ -190,6 +191,20 @@ struct Reduction< Devices::Host >
            Reduce&& reduce,
            const Result& zero );
 
+   template< typename Index,
+             typename Fetch,
+             typename Reduce_ >
+   static auto
+   reduce( const Index begin,
+           const Index end,
+           Fetch&& fetch,
+           Reduce_&& reduce_ ) -> decltype( fetch( ( Index ) 0 ) )
+   {
+      using Result = decltype( fetch( ( Index ) 0 ) );
+      return reduce( begin, end, fetch, reduce_, std::remove_reference< Reduce_ >::type::template getIdempotent< Result >() );
+   };
+
+
    /**
     * \brief Computes reduction on CPU and returns position of an element of interest.
     *
@@ -240,7 +255,7 @@ struct Reduction< Devices::Host >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& zero = Reduce::idempotent );
 };
 
 template<>
@@ -291,7 +306,7 @@ struct Reduction< Devices::Cuda >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& zero = Reduce::idempotent );
 
    /**
     * \brief Computes reduction on GPU and returns position of an element of interest.
@@ -343,9 +358,52 @@ struct Reduction< Devices::Cuda >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero );
+                       const Result& zero = Reduce::idempotent );
 };
 
+template< typename Device,
+          typename Index,
+          typename Result,
+          typename Fetch,
+          typename Reduce >
+Result reduce( const Index begin,
+               const Index end,
+               Fetch&& fetch,
+               Reduce&& reduce,
+               const Result& zero )
+{
+    return Reduction< Device >::reduce( begin, end, fetch, reduce, zero );
+}
+
+template< typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduce >
+auto reduce( const Index begin,
+             const Index end,
+             Fetch&& fetch,
+             Reduce&& reduce ) -> decltype( Reduction< Device >::reduce( begin, end, fetch, reduce ) )
+{
+   return Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ) );
+}
+
+
+template< typename Device,
+          typename Index,
+          typename Result,
+          typename Fetch,
+          typename Reduce >
+std::pair< Result, Index >
+reduceWithArgument( const Index begin,
+                    const Index end,
+                    Fetch&& fetch,
+                    Reduce&& reduce,
+                    const Result& zero = Reduce::template getIdempotent< Result >() )
+{
+    return Reduction< Device >::reduceWithArgument( begin, end, fetch, reduce, zero );
+}
+
+
 } // namespace Algorithms
 } // namespace TNL
 
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
new file mode 100644
index 000000000..6a0589bd3
--- /dev/null
+++ b/src/TNL/Functional.h
@@ -0,0 +1,134 @@
+/***************************************************************************
+                          Functional.h  -  description
+                             -------------------
+    begin                : Juyl 1, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#include <limits>
+
+namespace TNL {
+
+/*template< typename Value,
+          int size = sizeof( Value ) >
+struct AllBitsTrue
+{
+   static constexpr Value aux = AllBitsTrue< Value, size - 1 >::value << 8;
+   static constexpr Value value = ( Value ) aux | 0xff;
+};
+
+template< typename Value >
+struct AllBitsTrue< Value, 1 >
+{
+   static constexpr Value value = ( Value ) 0xff;
+};
+
+template< typename Value,
+          int size = sizeof( Value ) >
+struct AllBitsFalse
+{
+   static constexpr Value aux = AllBitsFalse< Value, size - 1 >::value << 8;
+   static constexpr Value value = ( Value ) aux | 0x00;
+};
+
+template< typename Value >
+struct AllBitsFalse< Value, 1 >
+{
+   static constexpr Value value = ( Value ) 0x00;
+};*/
+
+
+template< typename Value = void >
+struct Plus
+{
+   using ValueType = Value;
+
+   static constexpr Value getIdempotent() { return ( Value ) 0; };
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs + rhs; }
+};
+
+template<>
+struct Plus< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return ( T ) 0; };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs + rhs; }
+};
+
+template< typename Value = void >
+struct Multiplies
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = 1;
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs * rhs; }
+};
+
+template< typename Value = void >
+struct Min
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = std::numeric_limits< Value >::max();
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs < rhs ? lhs : rhs; }
+};
+
+template< typename Value = void >
+struct Max
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = std::numeric_limits< Value >::min();
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs > rhs ? lhs : rhs; }
+};
+
+template< typename Value = void >
+struct LogicalAnd
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = ( Value ) true;
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs && rhs; }
+};
+
+template< typename Value = void >
+struct LogicalOr
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = ( Value ) false;
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs || rhs; }
+};
+
+template< typename Value = void >
+struct BitAnd
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = ~static_cast< ValueType >( 0 );
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs & rhs; }
+};
+
+template< typename Value = void >
+struct BitOr
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent =  static_cast< ValueType >( 0 );
+
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs | rhs; }
+};
+
+} // namespace TNL
diff --git a/src/UnitTests/Algorithms/CMakeLists.txt b/src/UnitTests/Algorithms/CMakeLists.txt
index 1e4361f49..14a7d43ab 100644
--- a/src/UnitTests/Algorithms/CMakeLists.txt
+++ b/src/UnitTests/Algorithms/CMakeLists.txt
@@ -4,6 +4,7 @@ set( COMMON_TESTS
          MemoryOperationsTest
          MultireductionTest
          ParallelForTest
+         ReductionTest
          staticForTest
          unrolledForTest
 )
diff --git a/src/UnitTests/Algorithms/ReductionTest.cpp b/src/UnitTests/Algorithms/ReductionTest.cpp
new file mode 100644
index 000000000..4d630e5f9
--- /dev/null
+++ b/src/UnitTests/Algorithms/ReductionTest.cpp
@@ -0,0 +1 @@
+#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.cu b/src/UnitTests/Algorithms/ReductionTest.cu
new file mode 100644
index 000000000..4d630e5f9
--- /dev/null
+++ b/src/UnitTests/Algorithms/ReductionTest.cu
@@ -0,0 +1 @@
+#include "ReductionTest.h"
diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
new file mode 100644
index 000000000..adc3b268e
--- /dev/null
+++ b/src/UnitTests/Algorithms/ReductionTest.h
@@ -0,0 +1,45 @@
+/***************************************************************************
+                          ReductionTest.h  -  description
+                             -------------------
+    begin                : Jul 2, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+#include <TNL/Containers/Array.h>
+#include <TNL/Algorithms/Reduction.h>
+
+#ifdef HAVE_GTEST
+#include <gtest/gtest.h>
+#endif
+
+using namespace TNL;
+
+#ifdef HAVE_GTEST
+TEST( ReduceTest, sum )
+{
+   using Array = Containers::Array< int, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000; size *= 10 )
+   {
+      a.setSize( size );
+      a.setValue( 1 );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::Plus<>{} );
+      EXPECT_EQ( res, size );
+
+   }
+}
+
+
+#endif
+
+#include "../main.h"
diff --git a/src/UnitTests/CMakeLists.txt b/src/UnitTests/CMakeLists.txt
index 04a3a4f00..d50b682ba 100644
--- a/src/UnitTests/CMakeLists.txt
+++ b/src/UnitTests/CMakeLists.txt
@@ -6,7 +6,14 @@ ADD_SUBDIRECTORY( Functions )
 ADD_SUBDIRECTORY( Meshes )
 ADD_SUBDIRECTORY( Pointers )
 
-set( CPP_TESTS  AssertTest base64Test FileNameTest MathTest ObjectTest StringTest TimerTest TypeInfoTest )
+set( CPP_TESTS  AssertTest
+                base64Test
+                FileNameTest
+                MathTest
+                ObjectTest
+                StringTest
+                TimerTest
+                TypeInfoTest )
 set( CUDA_TESTS  AssertCudaTest )
 if( BUILD_CUDA )
    set( CUDA_TESTS  ${CUDA_TESTS} AllocatorsTest FileTest )
-- 
GitLab


From 9ff98cf4acf35e63bf9f7e6217b406f1436e028c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 11:33:03 +0200
Subject: [PATCH 02/34] Added specializations of functionals for void.

---
 src/TNL/Functional.h | 73 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index 6a0589bd3..6a40088bb 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -71,6 +71,17 @@ struct Multiplies
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs * rhs; }
 };
 
+template<>
+struct Multiplies< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return ( T ) 1; };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs * rhs; }
+};
+
+
 template< typename Value = void >
 struct Min
 {
@@ -81,6 +92,17 @@ struct Min
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs < rhs ? lhs : rhs; }
 };
 
+template<>
+struct Min< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs < rhs ? lhs : rhs; }
+};
+
+
 template< typename Value = void >
 struct Max
 {
@@ -91,6 +113,16 @@ struct Max
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs > rhs ? lhs : rhs; }
 };
 
+template<>
+struct Max< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs > rhs ? lhs : rhs; }
+};
+
 template< typename Value = void >
 struct LogicalAnd
 {
@@ -101,6 +133,16 @@ struct LogicalAnd
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs && rhs; }
 };
 
+template<>
+struct LogicalAnd< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return ( T ) true; };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs && rhs; }
+};
+
 template< typename Value = void >
 struct LogicalOr
 {
@@ -110,6 +152,16 @@ struct LogicalOr
 
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs || rhs; }
 };
+template<>
+struct LogicalOr< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return ( T ) false; };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs || rhs; }
+};
+
 
 template< typename Value = void >
 struct BitAnd
@@ -121,6 +173,16 @@ struct BitAnd
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs & rhs; }
 };
 
+template<>
+struct BitAnd< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return ~static_cast< T >( 0 ); };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs & rhs; }
+};
+
 template< typename Value = void >
 struct BitOr
 {
@@ -131,4 +193,15 @@ struct BitOr
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs | rhs; }
 };
 
+template<>
+struct BitOr< void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return static_cast< T >( 0 ); };
+
+   template< typename T >
+   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs | rhs; }
+};
+
+
 } // namespace TNL
-- 
GitLab


From 200685d6205e30a9557de3eab7f44e7b932f9dd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 11:33:21 +0200
Subject: [PATCH 03/34] Added reduction unit tests.

---
 src/UnitTests/Algorithms/ReductionTest.h | 97 +++++++++++++++++++++++-
 1 file changed, 96 insertions(+), 1 deletion(-)

diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
index adc3b268e..2b30f21dd 100644
--- a/src/UnitTests/Algorithms/ReductionTest.h
+++ b/src/UnitTests/Algorithms/ReductionTest.h
@@ -26,7 +26,7 @@ TEST( ReduceTest, sum )
 {
    using Array = Containers::Array< int, Devices::Host >;
    Array a;
-   for( int size = 100; size <= 1000; size *= 10 )
+   for( int size = 100; size <= 1000000; size *= 10 )
    {
       a.setSize( size );
       a.setValue( 1 );
@@ -35,10 +35,105 @@ TEST( ReduceTest, sum )
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
       auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::Plus<>{} );
       EXPECT_EQ( res, size );
+   }
+}
+
+TEST( ReduceTest, min )
+{
+   using Array = Containers::Array< int, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::Min<>{} );
+      EXPECT_EQ( res, 1 );
+   }
+}
+
+TEST( ReduceTest, max )
+{
+   using Array = Containers::Array< int, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::Max<>{} );
+      EXPECT_EQ( res, size );
+   }
+}
+
+
+TEST( ReduceTest, logicalAnd )
+{
+   using Array = Containers::Array< bool, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, bool& value ) { value = ( bool ) ( idx % 2 ); } );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::LogicalAnd<>{} );
+      EXPECT_EQ( res, false );
+   }
+}
+
+TEST( ReduceTest, logicalOr )
+{
+   using Array = Containers::Array< bool, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, bool& value ) { value = ( bool ) ( idx % 2 ); } );
+      auto a_view = a.getView();
 
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::LogicalOr<>{} );
+      EXPECT_EQ( res, true );
    }
 }
 
+TEST( ReduceTest, bitAnd )
+{
+   using Array = Containers::Array< char, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, char& value ) { value = 1 | ( 1 << ( idx % 8 ) ); } );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::BitAnd<>{} );
+      EXPECT_EQ( res, 1 );
+   }
+}
+
+TEST( ReduceTest, bitOr )
+{
+   using Array = Containers::Array< char, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, char& value ) { value = 1 << ( idx % 8 );} );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::BitOr<>{} );
+      EXPECT_EQ( res, ( char ) 255 );
+   }
+}
 
 #endif
 
-- 
GitLab


From 8e1e2a06725623995b92999fc8bf8f787e03168a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 13:14:59 +0200
Subject: [PATCH 04/34] Added reduction unit tests for CUDA.

---
 src/UnitTests/Algorithms/ReductionTest.h | 33 +++++++++++++++++++-----
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
index 2b30f21dd..6eb32ba47 100644
--- a/src/UnitTests/Algorithms/ReductionTest.h
+++ b/src/UnitTests/Algorithms/ReductionTest.h
@@ -22,7 +22,26 @@
 using namespace TNL;
 
 #ifdef HAVE_GTEST
-TEST( ReduceTest, sum )
+
+// test fixture for typed tests
+template< typename Device >
+class ReduceTest : public ::testing::Test
+{
+protected:
+   using DeviceType = Device;
+};
+
+// types for which ArrayTest is instantiated
+using DeviceTypes = ::testing::Types<
+   Devices::Host
+#ifdef HAVE_CUDA
+   ,Devices::Cuda
+#endif
+   >;
+
+TYPED_TEST_SUITE( ReduceTest, DeviceTypes );
+
+TYPED_TEST( ReduceTest, sum )
 {
    using Array = Containers::Array< int, Devices::Host >;
    Array a;
@@ -38,7 +57,7 @@ TEST( ReduceTest, sum )
    }
 }
 
-TEST( ReduceTest, min )
+TYPED_TEST( ReduceTest, min )
 {
    using Array = Containers::Array< int, Devices::Host >;
    Array a;
@@ -54,7 +73,7 @@ TEST( ReduceTest, min )
    }
 }
 
-TEST( ReduceTest, max )
+TYPED_TEST( ReduceTest, max )
 {
    using Array = Containers::Array< int, Devices::Host >;
    Array a;
@@ -71,7 +90,7 @@ TEST( ReduceTest, max )
 }
 
 
-TEST( ReduceTest, logicalAnd )
+TYPED_TEST( ReduceTest, logicalAnd )
 {
    using Array = Containers::Array< bool, Devices::Host >;
    Array a;
@@ -87,7 +106,7 @@ TEST( ReduceTest, logicalAnd )
    }
 }
 
-TEST( ReduceTest, logicalOr )
+TYPED_TEST( ReduceTest, logicalOr )
 {
    using Array = Containers::Array< bool, Devices::Host >;
    Array a;
@@ -103,7 +122,7 @@ TEST( ReduceTest, logicalOr )
    }
 }
 
-TEST( ReduceTest, bitAnd )
+TYPED_TEST( ReduceTest, bitAnd )
 {
    using Array = Containers::Array< char, Devices::Host >;
    Array a;
@@ -119,7 +138,7 @@ TEST( ReduceTest, bitAnd )
    }
 }
 
-TEST( ReduceTest, bitOr )
+TYPED_TEST( ReduceTest, bitOr )
 {
    using Array = Containers::Array< char, Devices::Host >;
    Array a;
-- 
GitLab


From 78e3c099c6d3f024812581cd5039db4a387abd1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 13:25:38 +0200
Subject: [PATCH 05/34] Added documentation to Functional.h

---
 src/TNL/Functional.h | 111 +++++++++++++++++++++++++++++++------------
 1 file changed, 81 insertions(+), 30 deletions(-)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index 6a40088bb..7d9ce640a 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -12,35 +12,11 @@
 
 namespace TNL {
 
-/*template< typename Value,
-          int size = sizeof( Value ) >
-struct AllBitsTrue
-{
-   static constexpr Value aux = AllBitsTrue< Value, size - 1 >::value << 8;
-   static constexpr Value value = ( Value ) aux | 0xff;
-};
-
-template< typename Value >
-struct AllBitsTrue< Value, 1 >
-{
-   static constexpr Value value = ( Value ) 0xff;
-};
-
-template< typename Value,
-          int size = sizeof( Value ) >
-struct AllBitsFalse
-{
-   static constexpr Value aux = AllBitsFalse< Value, size - 1 >::value << 8;
-   static constexpr Value value = ( Value ) aux | 0x00;
-};
-
-template< typename Value >
-struct AllBitsFalse< Value, 1 >
-{
-   static constexpr Value value = ( Value ) 0x00;
-};*/
-
-
+/**
+ * \brief Replacement of std::plus which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct Plus
 {
@@ -51,6 +27,11 @@ struct Plus
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs + rhs; }
 };
 
+/**
+ * \brief Replacement of std::plus which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct Plus< void >
 {
@@ -61,6 +42,11 @@ struct Plus< void >
    constexpr T operator()( const T& lhs, const T& rhs ) { return lhs + rhs; }
 };
 
+/**
+ * \brief Replacement of std::multiplies which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct Multiplies
 {
@@ -71,6 +57,11 @@ struct Multiplies
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs * rhs; }
 };
 
+/**
+ * \brief Replacement of std::multiplies which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct Multiplies< void >
 {
@@ -82,6 +73,11 @@ struct Multiplies< void >
 };
 
 
+/**
+ * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct Min
 {
@@ -92,6 +88,11 @@ struct Min
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs < rhs ? lhs : rhs; }
 };
 
+/**
+ * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct Min< void >
 {
@@ -103,6 +104,11 @@ struct Min< void >
 };
 
 
+/**
+ * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct Max
 {
@@ -113,6 +119,11 @@ struct Max
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs > rhs ? lhs : rhs; }
 };
 
+/**
+ * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct Max< void >
 {
@@ -123,6 +134,11 @@ struct Max< void >
    constexpr T operator()( const T& lhs, const T& rhs ) { return lhs > rhs ? lhs : rhs; }
 };
 
+/**
+ * \brief Replacement of std::logical_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct LogicalAnd
 {
@@ -133,6 +149,11 @@ struct LogicalAnd
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs && rhs; }
 };
 
+/**
+ * \brief Replacement of std::logical_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct LogicalAnd< void >
 {
@@ -143,6 +164,11 @@ struct LogicalAnd< void >
    constexpr T operator()( const T& lhs, const T& rhs ) { return lhs && rhs; }
 };
 
+/**
+ * \brief Replacement of std::logical_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct LogicalOr
 {
@@ -152,6 +178,12 @@ struct LogicalOr
 
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs || rhs; }
 };
+
+/**
+ * \brief Replacement of std::logical_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct LogicalOr< void >
 {
@@ -163,6 +195,11 @@ struct LogicalOr< void >
 };
 
 
+/**
+ * \brief Replacement of std::bit_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct BitAnd
 {
@@ -173,6 +210,11 @@ struct BitAnd
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs & rhs; }
 };
 
+/**
+ * \brief Replacement of std::bit_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct BitAnd< void >
 {
@@ -183,6 +225,11 @@ struct BitAnd< void >
    constexpr T operator()( const T& lhs, const T& rhs ) { return lhs & rhs; }
 };
 
+/**
+ * \brief Replacement of std::bit_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * \tparam Value is data type.
+ */
 template< typename Value = void >
 struct BitOr
 {
@@ -193,6 +240,11 @@ struct BitOr
    constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs | rhs; }
 };
 
+/**
+ * \brief Replacement of std::bit_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
 template<>
 struct BitOr< void >
 {
@@ -203,5 +255,4 @@ struct BitOr< void >
    constexpr T operator()( const T& lhs, const T& rhs ) { return lhs | rhs; }
 };
 
-
 } // namespace TNL
-- 
GitLab


From f44bbb719ec8a4508673f5845314137a23d8e273 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 15:11:43 +0200
Subject: [PATCH 06/34] Added functionals with argument.

---
 src/TNL/Functional.h | 105 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 104 insertions(+), 1 deletion(-)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index 7d9ce640a..e5113e90a 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -72,7 +72,6 @@ struct Multiplies< void >
    constexpr T operator()( const T& lhs, const T& rhs ) { return lhs * rhs; }
 };
 
-
 /**
  * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduce.
  *
@@ -134,6 +133,110 @@ struct Max< void >
    constexpr T operator()( const T& lhs, const T& rhs ) { return lhs > rhs ? lhs : rhs; }
 };
 
+/**
+ * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ *
+ * \tparam Value is data type.
+ */
+template< typename Value = void, typename Index = void >
+struct MinWithArg
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = std::numeric_limits< Value >::max();
+
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   {
+      if( lhs > rhs )
+      {
+         lhs = rhs;
+         lhsIdx = rhsIdx;
+      }
+      else if( lhs == rhs && rhsIdx < lhsIdx )
+      {
+         lhsIdx = rhsIdx;
+      }
+   }
+};
+
+/**
+ * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
+template<>
+struct MinWithArg< void, void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
+
+   template< typename Value, typename Index >
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   {
+      if( lhs > rhs )
+      {
+         lhs = rhs;
+         lhsIdx = rhsIdx;
+      }
+      else if( lhs == rhs && rhsIdx < lhsIdx )
+      {
+         lhsIdx = rhsIdx;
+      }
+   }
+};
+
+/**
+ * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ *
+ * \tparam Value is data type.
+ */
+template< typename Value = void, typename Index = void >
+struct MaxWithArg
+{
+   using ValueType = Value;
+
+   static constexpr ValueType idempotent = std::numeric_limits< Value >::min();
+
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   {
+      if( lhs < rhs )
+      {
+         lhs = rhs;
+         lhsIdx = rhsIdx;
+      }
+      else if( lhs == rhs && rhsIdx < lhsIdx )
+      {
+         lhsIdx = rhsIdx;
+      }
+   }
+};
+
+/**
+ * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ *
+ * This is specialization for void type. The real type is deduced just when operator() is evoked.
+ */
+template<>
+struct MaxWithArg< void, void >
+{
+   template< typename T >
+   static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
+
+   template< typename Value, typename Index >
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   {
+      if( lhs < rhs )
+      {
+         lhs = rhs;
+         lhsIdx = rhsIdx;
+      }
+      else if( lhs == rhs && rhsIdx < lhsIdx )
+      {
+         lhsIdx = rhsIdx;
+      }
+   }
+};
+
 /**
  * \brief Replacement of std::logical_and which is optimized for use with \ref TNL::Algorithms::reduce.
  *
-- 
GitLab


From 2dd27ada385c8ccd980491152c0aa786ba5fc5fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 15:12:03 +0200
Subject: [PATCH 07/34] Fixing reduction with argument.

---
 src/TNL/Algorithms/Reduction.h | 228 +++++++++++++++++++++++++++++++--
 1 file changed, 214 insertions(+), 14 deletions(-)

diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index f394ce86c..5993e52e7 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -41,6 +41,8 @@ struct Reduction;
 template<>
 struct Reduction< Devices::Sequential >
 {
+   using DeviceType = Devices::Sequential;
+
    /**
     * \brief Computes reduction on CPU sequentially.
     *
@@ -86,7 +88,7 @@ struct Reduction< Devices::Sequential >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero = Reduce::template getIdempotent< Result >() );
+           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
 
    /**
     * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
@@ -138,12 +140,14 @@ struct Reduction< Devices::Sequential >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero = Reduce::idempotent );
+                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
 };
 
 template<>
 struct Reduction< Devices::Host >
 {
+   using DeviceType = Devices::Host;
+
    /**
     * \brief Computes reduction on CPU.
     *
@@ -189,9 +193,9 @@ struct Reduction< Devices::Host >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero );
+           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
 
-   template< typename Index,
+   /*template< typename Index,
              typename Fetch,
              typename Reduce_ >
    static auto
@@ -202,7 +206,7 @@ struct Reduction< Devices::Host >
    {
       using Result = decltype( fetch( ( Index ) 0 ) );
       return reduce( begin, end, fetch, reduce_, std::remove_reference< Reduce_ >::type::template getIdempotent< Result >() );
-   };
+   };*/
 
 
    /**
@@ -255,12 +259,14 @@ struct Reduction< Devices::Host >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero = Reduce::idempotent );
+                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
 };
 
 template<>
 struct Reduction< Devices::Cuda >
 {
+   using DeviceType = Devices::Cuda;
+
    /**
     * \brief Computes reduction on GPU.
     *
@@ -306,7 +312,7 @@ struct Reduction< Devices::Cuda >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero = Reduce::idempotent );
+           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
 
    /**
     * \brief Computes reduction on GPU and returns position of an element of interest.
@@ -358,9 +364,53 @@ struct Reduction< Devices::Cuda >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero = Reduce::idempotent );
+                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
 };
 
+/**
+ * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
+ *
+ * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
+ * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
+ * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
+ * position of the smallest or the largest element, reduction with argument can be used.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ * \tparam Reduce is a lambda function performing the reduction.
+ *
+ * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a lambda function defining the reduction operation.
+ * \param zero is the idempotent element for the reduction operation, i.e. element which
+ *             does not change the result of the reduction.
+ * \return result of the reduction
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/SumExampleWithLambda.cpp
+ *
+ * \par Output
+ *
+ * \include SumExampleWithLambda.out
+ */
 template< typename Device,
           typename Index,
           typename Result,
@@ -372,9 +422,42 @@ Result reduce( const Index begin,
                Reduce&& reduce,
                const Result& zero )
 {
-    return Reduction< Device >::reduce( begin, end, fetch, reduce, zero );
+    return Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ), zero );
 }
 
+/**
+ * \brief Variant of \ref TNL::Algorithms::reduce with functional instead of reduction lambda function.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ * \tparam Reduce is a functional performing the reduction.
+ *
+ * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \e Reduce can be one of the following \ref TNL::Plus, \ref TNL::Multiplies, \ref TNL::Min, \ref TNL::Max, \ref TNL::LogicalAnd,
+ *    \ref TNL::LogicalOr, \ref TNL::BitAnd or \ref TNL::BitOr.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a lambda function defining the reduction operation.
+ * \return result of the reduction
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/SumExampleWithFunctional.cpp
+ *
+ * \par Output
+ *
+ * \include SumExampleWithFunctional.out
+ */
 template< typename Device,
           typename Index,
           typename Fetch,
@@ -382,12 +465,61 @@ template< typename Device,
 auto reduce( const Index begin,
              const Index end,
              Fetch&& fetch,
-             Reduce&& reduce ) -> decltype( Reduction< Device >::reduce( begin, end, fetch, reduce ) )
+             Reduce&& reduce ) -> decltype( Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ),
+                                                   std::remove_reference< Reduce >::type::template getIdempotent< decltype( fetch( ( Index ) 0 ) ) >() ) )
 {
-   return Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ) );
+   using Result = decltype( fetch( ( Index ) 0 ) );
+   return Reduction< Device >::reduce( begin,
+                                       end,
+                                       std::forward< Fetch >( fetch ),
+                                       std::forward< Reduce >( reduce ),
+                                       std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
 }
 
-
+/**
+ * \brief Variant of \ref TNL::Algorithms::reduce returning also a position of an element of interest.
+ *
+ * For example in case of computing minimal or maximal element in array/vector,
+ * the position of the element having given value can be obtained. The use of this method
+ * is, however, more flexible.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Reduce is a lambda function performing the reduction.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ *
+ * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+ * \param zero is the idempotent element for the reduction operation, i.e. element which
+ *             does not change the result of the reduction.
+ * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
+ *         is the element position and `pair.second` is the reduction result.
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/ReductionWithArgument.cpp
+ *
+ * \par Output
+ *
+ * \include ReductionWithArgument.out
+ */
 template< typename Device,
           typename Index,
           typename Result,
@@ -398,9 +530,77 @@ reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
                     Reduce&& reduce,
-                    const Result& zero = Reduce::template getIdempotent< Result >() )
+                    const Result& zero )
+{
+    return Reduction< Device >::reduceWithArgument( begin,
+                                                    end,
+                                                    std::forward< Fetch >( fetch ),
+                                                    std::forward< Reduce >( reduce ),
+                                                    zero );
+}
+
+/**
+ * \brief Variant of \ref TNL::Algorithms::reduceWithArgument with functional instead of reduction lambda function.
+ *
+ * For example in case of computing minimal or maximal element in array/vector,
+ * the position of the element having given value can be obtained. The use of this method
+ * is, however, more flexible.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ * \tparam Index is a type for indexing.
+ * \tparam Result is a type of the reduction result.
+ * \tparam Reduce is a functional performing the reduction.
+ * \tparam Fetch is a lambda function for fetching the input data.
+ *
+ * \e Device can be on of the following \ref TNL::Devices::Sequential, \ref TNL::Devices::Host and \ref TNL::Devices::Cuda.
+ *
+ * \e Reduce can be one of \ref TNL::MinWithArg, \ref TNL::MaxWithArg.
+ * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+ * \param end defines range [begin, end) of indexes which will be used for the reduction.
+ * \param fetch is a lambda function fetching the input data.
+ * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+ * \param zero is the idempotent element for the reduction operation, i.e. element which
+ *             does not change the result of the reduction.
+ * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
+ *         is the element position and `pair.second` is the reduction result.
+ *
+ * The `fetch` lambda function takes one argument which is index of the element to be fetched:
+ *
+ * ```
+ * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
+ * ```
+ *
+ * The `reduce` lambda function takes two variables which are supposed to be reduced:
+ *
+ * ```
+ * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
+ * ```
+ *
+ * \par Example
+ *
+ * \include ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
+ *
+ * \par Output
+ *
+ * \include ReductionWithArgumentWithFunctional.out
+ */
+template< typename Device,
+          typename Index,
+          typename Fetch,
+          typename Reduce >
+auto
+reduceWithArgument( const Index begin,
+                    const Index end,
+                    Fetch&& fetch,
+                    Reduce&& reduce ) -> decltype( Reduction< Device >::reduceWithArgument( begin, end, fetch, reduce,
+                                                   std::remove_reference< Reduce >::type::template getIdempotent<  decltype( fetch( ( Index ) 0 ) ) >() ) )
 {
-    return Reduction< Device >::reduceWithArgument( begin, end, fetch, reduce, zero );
+   using Result = decltype( fetch( ( Index ) 0 ) );
+   return Reduction< Device >::reduceWithArgument( begin,
+                                                   end,
+                                                   std::forward< Fetch >( fetch ),
+                                                   std::forward< Reduce >( reduce ),
+                                                   std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
 }
 
 
-- 
GitLab


From 97600867db0e1ed44b5578bebbb14f42e2c17612 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 15:18:06 +0200
Subject: [PATCH 08/34] Added unit tests for reduction with argument.

---
 src/UnitTests/Algorithms/ReductionTest.h | 34 ++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
index 6eb32ba47..e2d573e50 100644
--- a/src/UnitTests/Algorithms/ReductionTest.h
+++ b/src/UnitTests/Algorithms/ReductionTest.h
@@ -89,6 +89,40 @@ TYPED_TEST( ReduceTest, max )
    }
 }
 
+TYPED_TEST( ReduceTest, minWithArg )
+{
+   using Array = Containers::Array< int, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduceWithArgument< Devices::Host >( ( int ) 0, size, fetch, TNL::MinWithArg<>{} );
+      EXPECT_EQ( res.first, 1 );
+      EXPECT_EQ( res.second, 0 );
+   }
+}
+
+TYPED_TEST( ReduceTest, maxWithArg )
+{
+   using Array = Containers::Array< int, Devices::Host >;
+   Array a;
+   for( int size = 100; size <= 1000000; size *= 10 )
+   {
+      a.setSize( size );
+      a.forAllElements( [] __cuda_callable__ ( int idx, int& value ) { value = idx + 1;} );
+      auto a_view = a.getView();
+
+      auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
+      auto res = Algorithms::reduceWithArgument< Devices::Host >( ( int ) 0, size, fetch, TNL::MaxWithArg<>{} );
+      EXPECT_EQ( res.first, size );
+      EXPECT_EQ( res.second, size - 1 );
+   }
+}
+
 
 TYPED_TEST( ReduceTest, logicalAnd )
 {
-- 
GitLab


From 5bfe20f0e817485fd0e683024968f1e69104ff05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 17:10:40 +0200
Subject: [PATCH 09/34] Added examples on rudection with functional.

---
 .../Tutorials/ReductionAndScan/CMakeLists.txt | 89 ++++++++++---------
 .../ReductionWithArgumentWithFunctional.cpp   | 36 ++++++++
 .../ReductionWithArgumentWithFunctional.cu    |  1 +
 .../SumExampleWithFunctional.cpp              | 51 +++++++++++
 .../SumExampleWithFunctional.cu               |  1 +
 5 files changed, 135 insertions(+), 43 deletions(-)
 create mode 100644 Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
 create mode 120000 Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cu
 create mode 100644 Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
 create mode 120000 Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cu

diff --git a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
index 92686e17b..547a55574 100644
--- a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
+++ b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
@@ -1,45 +1,48 @@
-IF( BUILD_CUDA )
-   CUDA_ADD_EXECUTABLE( SumExample SumExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND SumExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SumExample.out OUTPUT SumExample.out )
-   CUDA_ADD_EXECUTABLE( ProductExample ProductExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ProductExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ProductExample.out OUTPUT ProductExample.out )
-   CUDA_ADD_EXECUTABLE( ScalarProductExample ScalarProductExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ScalarProductExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ScalarProductExample.out OUTPUT ScalarProductExample.out )
-   CUDA_ADD_EXECUTABLE( MaximumNormExample MaximumNormExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MaximumNormExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MaximumNormExample.out OUTPUT MaximumNormExample.out )
-   CUDA_ADD_EXECUTABLE( ComparisonExample ComparisonExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ComparisonExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ComparisonExample.out OUTPUT ComparisonExample.out )
-   CUDA_ADD_EXECUTABLE( UpdateAndResidueExample UpdateAndResidueExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND UpdateAndResidueExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/UpdateAndResidueExample.out OUTPUT UpdateAndResidueExample.out )
-   CUDA_ADD_EXECUTABLE( MapReduceExample-1 MapReduceExample-1.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-1 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-1.out OUTPUT MapReduceExample-1.out )
-   CUDA_ADD_EXECUTABLE( MapReduceExample-2 MapReduceExample-2.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-2 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-2.out OUTPUT MapReduceExample-2.out )
-   CUDA_ADD_EXECUTABLE( MapReduceExample-3 MapReduceExample-3.cu )
-   ADD_CUSTOM_COMMAND( COMMAND MapReduceExample-3 > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/MapReduceExample-3.out OUTPUT MapReduceExample-3.out )
-   CUDA_ADD_EXECUTABLE( ReductionWithArgument ReductionWithArgument.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ReductionWithArgument > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ReductionWithArgument.out OUTPUT ReductionWithArgument.out )
-   CUDA_ADD_EXECUTABLE( ScanExample ScanExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ScanExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ScanExample.out OUTPUT ScanExample.out )
-   CUDA_ADD_EXECUTABLE( ExclusiveScanExample ExclusiveScanExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND ExclusiveScanExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/ExclusiveScanExample.out OUTPUT ExclusiveScanExample.out )
-   CUDA_ADD_EXECUTABLE( SegmentedScanExample SegmentedScanExample.cu )
-   ADD_CUSTOM_COMMAND( COMMAND SegmentedScanExample > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/SegmentedScanExample.out OUTPUT SegmentedScanExample.out )
-ENDIF()
+set( COMMON_EXAMPLES
+     SumExample
+     SumExampleWithFunctional
+     ProductExample
+     ScalarProductExample
+     MaximumNormExample
+     ComparisonExample
+     UpdateAndResidueExample
+     MapReduceExample-1
+     MapReduceExample-2
+     MapReduceExample-3
+     ReductionWithArgument
+     ReductionWithArgumentWithFunctional
+     ScanExample
+     ExclusiveScanExample
+     SegmentedScanExample
+)
+
+if( BUILD_CUDA )
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+   foreach( target IN ITEMS ${LONG_EXAMPLES} )
+      cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+      #add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      #set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+   endforeach()
+else()
+   foreach( target IN ITEMS ${COMMON_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+   foreach( target IN ITEMS ${LONG_EXAMPLES} )
+      add_executable( ${target} ${target}.cpp )
+      #add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+      #set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
+   endforeach()
+endif()
+
 
 IF( BUILD_CUDA )
-ADD_CUSTOM_TARGET( TutorialsReduction-cuda ALL DEPENDS
-   SumExample.out
-   ProductExample.out
-   ScalarProductExample.out
-   MaximumNormExample.out
-   ComparisonExample.out
-   UpdateAndResidueExample.out
-   MapReduceExample-1.out
-   MapReduceExample-2.out
-   MapReduceExample-3.out
-   ReductionWithArgument.out
-   ScanExample.out
-   ExclusiveScanExample.out
-   SegmentedScanExample.out )
-ENDIF()
+   ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples-cuda ALL DEPENDS ${CUDA_OUTPUTS} )
+ELSE()
+   ADD_CUSTOM_TARGET( RunTutorialsReductionAndScanExamples ALL DEPENDS ${HOST_OUTPUTS} )
+ENDIF()
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
new file mode 100644
index 000000000..f2fa6208f
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
@@ -0,0 +1,36 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Reduction.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+std::pair< double, int >
+maximumNorm( const Vector< double, Device >& v )
+{
+   auto view = v.getConstView();
+
+   auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); };
+   return reduceWithArgument< Device >( 0, view.getSize(), fetch, TNL::MaxWithArg<>{} );
+}
+
+int main( int argc, char* argv[] )
+{
+   Vector< double, Devices::Host > host_v( 10 );
+   host_v.forAllElements( [] __cuda_callable__ ( int i, double& value ) { value = i - 7; } );
+   std::cout << "host_v = " << host_v << std::endl;
+   auto maxNormHost = maximumNorm( host_v );
+   std::cout << "The maximum norm of the host vector elements is " <<  maxNormHost.first << " at position " << maxNormHost.second << "." << std::endl;
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( 10 );
+   cuda_v.forAllElements( [] __cuda_callable__ ( int i, double& value ) { value = i - 7; } );
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+   auto maxNormCuda = maximumNorm( cuda_v );
+   std::cout << "The maximum norm of the device vector elements is " <<  maxNormCuda.first << " at position " << maxNormCuda.second << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cu b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cu
new file mode 120000
index 000000000..a546b6339
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cu
@@ -0,0 +1 @@
+ReductionWithArgumentWithFunctional.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
new file mode 100644
index 000000000..b68ad0a29
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
@@ -0,0 +1,51 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Reduction.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+double sum( const Vector< double, Device >& v )
+{
+   /****
+    * Get vector view which can be captured by lambda.
+    */
+   auto view = v.getConstView();
+
+   /****
+    * The fetch function just reads elements of vector v.
+    */
+   auto fetch = [=] __cuda_callable__ ( int i ) -> double { return view[ i ]; };
+
+   /***
+    * Finally we call the templated function Reduction and pass number of elements to reduce,
+    * lambda defined above and functional representing the reduction operation.
+    */
+   return reduce< Device >( 0, view.getSize(), fetch, TNL::Plus<>{} );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * Firstly, test the sum with vectors allocated on CPU.
+    */
+   Vector< double, Devices::Host > host_v( 10 );
+   host_v = 1.0;
+   std::cout << "host_v = " << host_v << std::endl;
+   std::cout << "The sum of the host vector elements is " << sum( host_v ) << "." << std::endl;
+
+   /***
+    * And then also on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_v( 10 );
+   cuda_v = 1.0;
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+   std::cout << "The sum of the CUDA vector elements is " << sum( cuda_v ) << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cu b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cu
new file mode 120000
index 000000000..c95dde139
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cu
@@ -0,0 +1 @@
+SumExampleWithFunctional.cpp
\ No newline at end of file
-- 
GitLab


From 29adf8362ce9160c184ea96ba3f51769f6936f24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 20:12:35 +0200
Subject: [PATCH 10/34] Moving Reduction to Algorithms::detail.

---
 .../DenseMatrixExample_getConstRow.cpp        |   2 +-
 .../DenseMatrixViewExample_getConstRow.cpp    |   2 +-
 ...MultidiagonalMatrixExample_getConstRow.cpp |   2 +-
 ...idiagonalMatrixViewExample_getConstRow.cpp |   2 +-
 .../SparseMatrixExample_getConstRow.cpp       |   2 +-
 .../SparseMatrixViewExample_getConstRow.cpp   |   2 +-
 .../TridiagonalMatrixExample_getConstRow.cpp  |   2 +-
 ...idiagonalMatrixViewExample_getConstRow.cpp |   2 +-
 ...orithms_and_lambda_functions_reduction.cpp |   2 +-
 ...ithms_and_lambda_functions_reduction_2.cpp |   2 +-
 .../ReductionAndScan/ComparisonExample.cpp    |   4 +-
 .../ReductionAndScan/MapReduceExample-1.cpp   |   4 +-
 .../ReductionAndScan/MapReduceExample-2.cpp   |   4 +-
 .../ReductionAndScan/MapReduceExample-3.cpp   |   4 +-
 .../ReductionAndScan/MaximumNormExample.cpp   |   4 +-
 .../ReductionAndScan/ProductExample.cpp       |   4 +-
 .../ReductionWithArgument.cpp                 |   2 +-
 .../ReductionAndScan/ScalarProductExample.cpp |   4 +-
 .../Tutorials/ReductionAndScan/SumExample.cpp |   4 +-
 .../UpdateAndResidueExample.cpp               |   4 +-
 .../tutorial_ReductionAndScan.md              |   2 +-
 .../BLAS/CommonVectorOperations.hpp           |  34 +-
 src/TNL/Algorithms/MemoryOperationsCuda.hpp   |   6 +-
 src/TNL/Algorithms/MemoryOperationsHost.hpp   |   6 +-
 src/TNL/Algorithms/Reduction.h                | 388 +-----------------
 src/TNL/Algorithms/detail/Reduction.h         | 244 +++++++++++
 src/TNL/Algorithms/{ => detail}/Reduction.hpp |   6 +-
 src/TNL/Containers/ArrayView.hpp              |   4 +-
 src/TNL/Containers/Expressions/Comparison.h   |  30 +-
 .../DistributedExpressionTemplates.h          |  12 +-
 .../DistributedVerticalOperations.h           |   4 +-
 .../Expressions/ExpressionTemplates.h         |  12 +-
 .../Expressions/VerticalOperations.h          |  20 +-
 src/TNL/Functional.h                          |   2 +
 src/TNL/Matrices/DenseMatrixView.hpp          |   2 +-
 src/TNL/Matrices/Matrix.hpp                   |   2 +-
 src/TNL/Matrices/MatrixView.hpp               |   2 +-
 src/TNL/Matrices/MultidiagonalMatrixView.hpp  |   2 +-
 src/TNL/Matrices/SparseMatrixView.hpp         |   4 +-
 src/TNL/Matrices/TridiagonalMatrixView.hpp    |   2 +-
 .../MeshDetails/layers/EntityTags/Layer.h     |   4 +-
 src/UnitTests/Matrices/DenseMatrixTest.h      |   2 +-
 42 files changed, 375 insertions(+), 473 deletions(-)
 create mode 100644 src/TNL/Algorithms/detail/Reduction.h
 rename src/TNL/Algorithms/{ => detail}/Reduction.hpp (99%)

diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp
index 2e05b1678..a4c78f97b 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixExample_getConstRow.cpp
@@ -36,7 +36,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp
index 5fc1195ec..17747e428 100644
--- a/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/DenseMatrix/DenseMatrixViewExample_getConstRow.cpp
@@ -29,7 +29,7 @@ void getRowExample()
       return row.getValue( rowIdx );
    };
 
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp
index b8ebf9181..f5f662b89 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixExample_getConstRow.cpp
@@ -41,7 +41,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << *matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp
index 346e331db..72d04c8ad 100644
--- a/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/MultidiagonalMatrix/MultidiagonalMatrixViewExample_getConstRow.cpp
@@ -32,7 +32,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
index 4d3ae4ff5..b13ead12c 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixExample_getConstRow.cpp
@@ -36,7 +36,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
index 2b5f0faed..85da6f5b3 100644
--- a/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/SparseMatrix/SparseMatrixViewExample_getConstRow.cpp
@@ -28,7 +28,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix trace is " << trace << "." << std::endl;
 }
 
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp
index 30bf9249e..8b11bdb17 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixExample_getConstRow.cpp
@@ -40,7 +40,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, matrix->getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << *matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp
index 20d55ff12..073fbc909 100644
--- a/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp
+++ b/Documentation/Examples/Matrices/TridiagonalMatrix/TridiagonalMatrixViewExample_getConstRow.cpp
@@ -30,7 +30,7 @@ void getRowExample()
    /***
     * Compute the matrix trace.
     */
-   int trace = TNL::Algorithms::Reduction< Device >::reduce( 0, view.getRows(), fetch, std::plus<>{}, 0 );
+   int trace = TNL::Algorithms::reduce< Device >( 0, view.getRows(), fetch, std::plus<>{}, 0 );
    std::cout << "Matrix reads as: " << std::endl << matrix << std::endl;
    std::cout << "Matrix trace is: " << trace << "." << std::endl;
 }
diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp
index fda9a41b9..f6a54481a 100644
--- a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp
+++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction.cpp
@@ -6,5 +6,5 @@ void scalarProduct( double* v1, double* v2, double* product, const int size )
     }
     auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) {
         return a + b; };
-    TNL::Algorithms::Reduction< Device >::reduce( 0, size, fetch, reduce, 0.0 );
+    TNL::Algorithms::reduce< Device >( 0, size, fetch, reduce, 0.0 );
 }
\ No newline at end of file
diff --git a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp
index ef17140ce..7c7993e3a 100644
--- a/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp
+++ b/Documentation/Tutorials/GeneralConcepts/snippet_algorithms_and_lambda_functions_reduction_2.cpp
@@ -8,5 +8,5 @@ void scalarProduct( double* u1, double* u2,
     }
     auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) {
         return a + b; };
-    TNL::Algorithms::Reduction< Device >::reduce( 0, size, fetch, reduce, 0.0 );
+    TNL::Algorithms::reduce< Device >( 0, size, fetch, reduce, 0.0 );
 }
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
index 931d07d2b..46d6e50a2 100644
--- a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
@@ -21,8 +21,8 @@ bool comparison( const Vector< double, Device >& u, const Vector< double, Device
    /***
     * Reduce performs logical AND on intermediate results obtained by fetch.
     */
-   auto reduce = [] __cuda_callable__ ( const bool& a, const bool& b ) { return a && b; };
-   return Reduction< Device >::reduce( 0, v_view.getSize(), fetch, reduce, true );
+   auto reduce_ = [] __cuda_callable__ ( const bool& a, const bool& b ) { return a && b; };
+   return reduce< Device >( 0, v_view.getSize(), fetch, reduce_, true );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
index 90a069c8a..ff02f9c86 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-1.cpp
@@ -13,8 +13,8 @@ double mapReduce( Vector< double, Device >& u )
    auto u_view = u.getView();
    auto fetch = [=] __cuda_callable__ ( int i )->double {
       return u_view[ i ] > 0 ? u_view[ i ] : 0.0; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, u_view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, u_view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
index da7c1c9c6..da873ca6f 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
@@ -15,8 +15,8 @@ double mapReduce( Vector< double, Device >& u )
    auto fetch = [=] __cuda_callable__ ( int i )->double {
       if( i % 2 == 0 ) return u_view[ i ];
       return 0.0; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, u_view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, u_view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
index 5b5f31131..cdd677f87 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
@@ -14,8 +14,8 @@ double mapReduce( Vector< double, Device >& u )
    auto u_view = u.getView();
    auto fetch = [=] __cuda_callable__ ( int i )->double {
       return u_view[ 2 * i ]; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, u_view.getSize() / 2, fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, u_view.getSize() / 2, fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
index 8d503cbd4..c9a5926ad 100644
--- a/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MaximumNormExample.cpp
@@ -12,8 +12,8 @@ double maximumNorm( const Vector< double, Device >& v )
 {
    auto view = v.getConstView();
    auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return max( a, b ); };
-   return Reduction< Device >::reduce( 0, view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return max( a, b ); };
+   return reduce< Device >( 0, view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
index 9df9a6e4b..e4cd58b5e 100644
--- a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
@@ -12,12 +12,12 @@ double product( const Vector< double, Device >& v )
 {
    auto view = v.getConstView();
    auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a * b; };
+   auto reduce_ = [] __cuda_callable__ ( const double& a, const double& b ) { return a * b; };
 
    /***
     * Since we compute the product of all elements, the reduction must be initialized by 1.0 not by 0.0.
     */
-   return Reduction< Device >::reduce( 0, view.getSize(), fetch, reduce, 1.0 );
+   return reduce< Device >( 0, view.getSize(), fetch, reduce_, 1.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
index 18ac3363b..79a82c733 100644
--- a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgument.cpp
@@ -22,7 +22,7 @@ maximumNorm( const Vector< double, Device >& v )
       else if( a == b && bIdx < aIdx )
          aIdx = bIdx;
    };
-   return Reduction< Device >::reduceWithArgument( 0, view.getSize(), fetch, reduction, std::numeric_limits< double >::max() );
+   return reduceWithArgument< Device >( 0, view.getSize(), fetch, reduction, std::numeric_limits< double >::max() );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
index 680075f84..2dd84aa03 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductExample.cpp
@@ -17,8 +17,8 @@ double scalarProduct( const Vector< double, Device >& u, const Vector< double, D
     * Fetch computes product of corresponding elements of both vectors.
     */
    auto fetch = [=] __cuda_callable__ ( int i ) { return u_view[ i ] * v_view[ i ]; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return Reduction< Device >::reduce( 0, v_view.getSize(), fetch, reduce, 0.0 );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return reduce< Device >( 0, v_view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
index 90c6f724a..8a60888b6 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
@@ -23,14 +23,14 @@ double sum( const Vector< double, Device >& v )
    /***
     * Reduction is sum of two numbers.
     */
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   auto reduce_ = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
 
    /***
     * Finally we call the templated function Reduction and pass number of elements to reduce,
     * lambdas defined above and finally value of idempotent element, zero in this case, which serve for the
     * reduction initiation.
     */
-   return Reduction< Device >::reduce( 0, view.getSize(), fetch, reduce, 0.0 );
+   return reduce< Device >( 0, view.getSize(), fetch, reduce_, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
index 8bd08e900..a2ccb8189 100644
--- a/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/UpdateAndResidueExample.cpp
@@ -16,8 +16,8 @@ double updateAndResidue( Vector< double, Device >& u, const Vector< double, Devi
       const double& add = delta_u_view[ i ];
       u_view[ i ] += tau * add;
       return add * add; };
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
-   return sqrt( Reduction< Device >::reduce( 0, u_view.getSize(), fetch, reduce, 0.0 ) );
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   return sqrt( reduce< Device >( 0, u_view.getSize(), fetch, reduction, 0.0 ) );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index 32d663bd3..92e3ea620 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -34,7 +34,7 @@ Putting everything together gives the following example:
 
 Since TNL vectors cannot be pass to CUDA kernels and so they cannot be captured by CUDA lambdas, we must first get vector view from the vector using a method `getConstView()`.
 
-Note tha we pass `0.0` as the last argument of the method `Reduction< Device >::reduce`. It is an *idempotent element* (see [Idempotence](https://cs.wikipedia.org/wiki/Idempotence)). It is an element which, for given operation, does not change the result. For addition, it is zero. The result looks as follows.
+Note tha we pass `0.0` as the last argument of the method `reduce< Device >`. It is an *idempotent element* (see [Idempotence](https://cs.wikipedia.org/wiki/Idempotence)). It is an element which, for given operation, does not change the result. For addition, it is zero. The result looks as follows.
 
 \include SumExample.out
 
diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
index d6a459677..72c1f344d 100644
--- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp
+++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp
@@ -30,7 +30,7 @@ getVectorMax( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -47,7 +47,7 @@ getVectorMin( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -64,7 +64,7 @@ getVectorAbsMax( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -81,7 +81,7 @@ getVectorAbsMin( const Vector& v )
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -97,7 +97,7 @@ getVectorL1Norm( const Vector& v )
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -113,7 +113,7 @@ getVectorL2Norm( const Vector& v )
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; };
-   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
+   return std::sqrt( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
 }
 
 template< typename Device >
@@ -136,7 +136,7 @@ getVectorLpNorm( const Vector& v,
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
-   return std::pow( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
 }
 
 template< typename Device >
@@ -155,7 +155,7 @@ getVectorSum( const Vector& v )
 
    const auto* data = v.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i )  -> ResultType { return data[ i ]; };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -175,7 +175,7 @@ getVectorDifferenceMax( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -195,7 +195,7 @@ getVectorDifferenceMin( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -215,7 +215,7 @@ getVectorDifferenceAbsMax( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Device >
@@ -235,7 +235,7 @@ getVectorDifferenceAbsMin( const Vector1& v1,
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
    auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Device >
@@ -254,7 +254,7 @@ getVectorDifferenceL1Norm( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -276,7 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1,
       auto diff = data1[ i ] - data2[ i ];
       return diff * diff;
    };
-   return std::sqrt( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
+   return std::sqrt( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ) );
 }
 
 template< typename Device >
@@ -302,7 +302,7 @@ getVectorDifferenceLpNorm( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); };
-   return std::pow( Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
+   return std::pow( Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 ), 1.0 / p );
 }
 
 template< typename Device >
@@ -321,7 +321,7 @@ getVectorDifferenceSum( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 template< typename Device >
@@ -340,7 +340,7 @@ getScalarProduct( const Vector1& v1,
    const auto* data1 = v1.getData();
    const auto* data2 = v2.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, v1.getSize(),  fetch, std::plus<>{}, ( ResultType ) 0 );
 }
 
 } // namespace Benchmarks
diff --git a/src/TNL/Algorithms/MemoryOperationsCuda.hpp b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
index 545192dfa..626847eba 100644
--- a/src/TNL/Algorithms/MemoryOperationsCuda.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsCuda.hpp
@@ -182,7 +182,7 @@ compare( const Element1* destination,
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
 
    auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; };
-   return Reduction< Devices::Cuda >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
 }
 
 template< typename Element,
@@ -198,7 +198,7 @@ containsValue( const Element* data,
    TNL_ASSERT_GE( size, (Index) 0, "" );
 
    auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return Reduction< Devices::Cuda >::reduce( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
 }
 
 template< typename Element,
@@ -214,7 +214,7 @@ containsOnlyValue( const Element* data,
    TNL_ASSERT_GE( size, 0, "" );
 
    auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; };
-   return Reduction< Devices::Cuda >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+   return reduce< Devices::Cuda >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
 }
 
 } // namespace Algorithms
diff --git a/src/TNL/Algorithms/MemoryOperationsHost.hpp b/src/TNL/Algorithms/MemoryOperationsHost.hpp
index 0034b8302..abebd9d15 100644
--- a/src/TNL/Algorithms/MemoryOperationsHost.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsHost.hpp
@@ -161,7 +161,7 @@ compare( const DestinationElement* destination,
 
    if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
       auto fetch = [destination, source] ( Index i ) -> bool { return destination[ i ] == source[ i ]; };
-      return Reduction< Devices::Host >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
    }
    else {
       // sequential algorithm can return as soon as it finds a mismatch
@@ -183,7 +183,7 @@ containsValue( const Element* data,
 
    if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
       auto fetch = [=] ( Index i ) -> bool { return data[ i ] == value; };
-      return Reduction< Devices::Host >::reduce( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
+      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_or<>{}, false );
    }
    else {
       // sequential algorithm can return as soon as it finds a match
@@ -205,7 +205,7 @@ containsOnlyValue( const Element* data,
 
    if( Devices::Host::isOMPEnabled() && Devices::Host::getMaxThreadsCount() > 1 ) {
       auto fetch = [data, value] ( Index i ) -> bool { return data[ i ] == value; };
-      return Reduction< Devices::Host >::reduce( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
+      return reduce< Devices::Host >( ( Index ) 0, size, fetch, std::logical_and<>{}, true );
    }
    else {
       // sequential algorithm can return as soon as it finds a mismatch
diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index 5993e52e7..f90c6934d 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -19,353 +19,10 @@
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
+#include <TNL/Algorithms/detail/Reduction.h>
 
 namespace TNL {
-namespace Algorithms {
-
-/**
- * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
- *
- * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
- * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
- * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
- * position of the smallest or the largest element, reduction with argument can be used.
- *
- * \tparam Device parameter says on what device the reduction is gonna be performed.
- *
- * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >.
- */
-template< typename Device >
-struct Reduction;
-
-template<>
-struct Reduction< Devices::Sequential >
-{
-   using DeviceType = Devices::Sequential;
-
-   /**
-    * \brief Computes reduction on CPU sequentially.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SumExample.cpp
-    *
-    * \par Output
-    *
-    * \include SumExample.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static constexpr Result
-   reduce( const Index begin,
-           const Index end,
-           Fetch&& fetch,
-           Reduce&& reduce,
-           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
-
-   /**
-    * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ReductionWithArgument.cpp
-    *
-    * \par Output
-    *
-    * \include ReductionWithArgument.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static constexpr std::pair< Result, Index >
-   reduceWithArgument( const Index begin,
-                       const Index end,
-                       Fetch&& fetch,
-                       Reduce&& reduce,
-                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
-};
-
-template<>
-struct Reduction< Devices::Host >
-{
-   using DeviceType = Devices::Host;
-
-   /**
-    * \brief Computes reduction on CPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SumExample.cpp
-    *
-    * \par Output
-    *
-    * \include SumExample.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static Result
-   reduce( const Index begin,
-           const Index end,
-           Fetch&& fetch,
-           Reduce&& reduce,
-           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
-
-   /*template< typename Index,
-             typename Fetch,
-             typename Reduce_ >
-   static auto
-   reduce( const Index begin,
-           const Index end,
-           Fetch&& fetch,
-           Reduce_&& reduce_ ) -> decltype( fetch( ( Index ) 0 ) )
-   {
-      using Result = decltype( fetch( ( Index ) 0 ) );
-      return reduce( begin, end, fetch, reduce_, std::remove_reference< Reduce_ >::type::template getIdempotent< Result >() );
-   };*/
-
-
-   /**
-    * \brief Computes reduction on CPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam ReductionOperation is a lambda function performing the reduction.
-    * \tparam DataFetcher is a lambda function for fetching the input data.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ReductionWithArgument.cpp
-    *
-    * \par Output
-    *
-    * \include ReductionWithArgument.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static std::pair< Result, Index >
-   reduceWithArgument( const Index begin,
-                       const Index end,
-                       Fetch&& fetch,
-                       Reduce&& reduce,
-                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
-};
-
-template<>
-struct Reduction< Devices::Cuda >
-{
-   using DeviceType = Devices::Cuda;
-
-   /**
-    * \brief Computes reduction on GPU.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/SumExample.cpp
-    *
-    * \par Output
-    *
-    * \include SumExample.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static Result
-   reduce( const Index begin,
-           const Index end,
-           Fetch&& fetch,
-           Reduce&& reduce,
-           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
-
-   /**
-    * \brief Computes reduction on GPU and returns position of an element of interest.
-    *
-    * For example in case of computing minimal or maximal element in array/vector,
-    * the position of the element having given value can be obtained. The use of this method
-    * is, however, more flexible.
-    *
-    * \tparam Index is a type for indexing.
-    * \tparam Result is a type of the reduction result.
-    * \tparam Fetch is a lambda function for fetching the input data.
-    * \tparam Reduce is a lambda function performing the reduction.
-    *
-    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
-    * \param end defines range [begin, end) of indexes which will be used for the reduction.
-    * \param fetch is a lambda function fetching the input data.
-    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
-    * \param zero is the idempotent element for the reduction operation, i.e. element which
-    *             does not change the result of the reduction.
-    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
-    *         is the element position and `pair.second` is the reduction result.
-    *
-    * The `fetch` lambda function takes one argument which is index of the element to be fetched:
-    *
-    * ```
-    * auto fetch = [=] __cuda_callable__ ( Index i ) { return ... };
-    * ```
-    *
-    * The `reduce` lambda function takes two variables which are supposed to be reduced:
-    *
-    * ```
-    * auto reduce = [] __cuda_callable__ ( const Result& a, const Result& b, Index& aIdx, const Index& bIdx ) { return ... };
-    * ```
-    *
-    * \par Example
-    *
-    * \include ReductionAndScan/ReductionWithArgument.cpp
-    *
-    * \par Output
-    *
-    * \include ReductionWithArgument.out
-    */
-   template< typename Index,
-             typename Result,
-             typename Fetch,
-             typename Reduce >
-   static std::pair< Result, Index >
-   reduceWithArgument( const Index begin,
-                       const Index end,
-                       Fetch&& fetch,
-                       Reduce&& reduce,
-                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
-};
+   namespace Algorithms {
 
 /**
  * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
@@ -422,7 +79,7 @@ Result reduce( const Index begin,
                Reduce&& reduce,
                const Result& zero )
 {
-    return Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ), zero );
+    return detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ), zero );
 }
 
 /**
@@ -465,15 +122,15 @@ template< typename Device,
 auto reduce( const Index begin,
              const Index end,
              Fetch&& fetch,
-             Reduce&& reduce ) -> decltype( Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ),
+             Reduce&& reduce ) -> decltype( detail::Reduction< Device >::reduce( begin, end, std::forward< Fetch >( fetch ), std::forward< Reduce >( reduce ),
                                                    std::remove_reference< Reduce >::type::template getIdempotent< decltype( fetch( ( Index ) 0 ) ) >() ) )
 {
    using Result = decltype( fetch( ( Index ) 0 ) );
-   return Reduction< Device >::reduce( begin,
-                                       end,
-                                       std::forward< Fetch >( fetch ),
-                                       std::forward< Reduce >( reduce ),
-                                       std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
+   return detail::Reduction< Device >::reduce( begin,
+                                               end,
+                                               std::forward< Fetch >( fetch ),
+                                               std::forward< Reduce >( reduce ),
+                                               std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
 }
 
 /**
@@ -532,11 +189,11 @@ reduceWithArgument( const Index begin,
                     Reduce&& reduce,
                     const Result& zero )
 {
-    return Reduction< Device >::reduceWithArgument( begin,
-                                                    end,
-                                                    std::forward< Fetch >( fetch ),
-                                                    std::forward< Reduce >( reduce ),
-                                                    zero );
+    return detail::Reduction< Device >::reduceWithArgument( begin,
+                                                            end,
+                                                            std::forward< Fetch >( fetch ),
+                                                            std::forward< Reduce >( reduce ),
+                                                            zero );
 }
 
 /**
@@ -592,19 +249,16 @@ auto
 reduceWithArgument( const Index begin,
                     const Index end,
                     Fetch&& fetch,
-                    Reduce&& reduce ) -> decltype( Reduction< Device >::reduceWithArgument( begin, end, fetch, reduce,
+                    Reduce&& reduce ) -> decltype( detail::Reduction< Device >::reduceWithArgument( begin, end, fetch, reduce,
                                                    std::remove_reference< Reduce >::type::template getIdempotent<  decltype( fetch( ( Index ) 0 ) ) >() ) )
 {
    using Result = decltype( fetch( ( Index ) 0 ) );
-   return Reduction< Device >::reduceWithArgument( begin,
-                                                   end,
-                                                   std::forward< Fetch >( fetch ),
-                                                   std::forward< Reduce >( reduce ),
-                                                   std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
+   return detail::Reduction< Device >::reduceWithArgument( begin,
+                                                           end,
+                                                           std::forward< Fetch >( fetch ),
+                                                           std::forward< Reduce >( reduce ),
+                                                           std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
 }
 
-
-} // namespace Algorithms
+   } // namespace Algorithms
 } // namespace TNL
-
-#include <TNL/Algorithms/Reduction.hpp>
diff --git a/src/TNL/Algorithms/detail/Reduction.h b/src/TNL/Algorithms/detail/Reduction.h
new file mode 100644
index 000000000..6ebef402d
--- /dev/null
+++ b/src/TNL/Algorithms/detail/Reduction.h
@@ -0,0 +1,244 @@
+/***************************************************************************
+                          Reduction.h  -  description
+                             -------------------
+    begin                : Jul 5, 2021
+    copyright            : (C) 2021 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+// Implemented by: Tomas Oberhuber, Jakub Klinkovsky
+
+#pragma once
+
+#include <utility>            // std::pair
+#include <functional>         // reduction functions like std::plus, std::logical_and, std::logical_or etc. - deprecated
+
+#include <TNL/Functional.h>   // replacement of STL functional
+#include <TNL/Devices/Sequential.h>
+#include <TNL/Devices/Host.h>
+#include <TNL/Devices/Cuda.h>
+
+namespace TNL {
+   namespace Algorithms {
+      namespace detail {
+
+/**
+ * \brief Reduction implements [(parallel) reduction](https://en.wikipedia.org/wiki/Reduce_(parallel_pattern)) for vectors and arrays.
+ *
+ * Reduction can be used for operations having one or more vectors (or arrays) elements is input and returning
+ * one number (or element) as output. Some examples of such operations can be vectors/arrays comparison,
+ * vector norm, scalar product of two vectors or computing minimum or maximum. If one needs to know even
+ * position of the smallest or the largest element, reduction with argument can be used.
+ *
+ * \tparam Device parameter says on what device the reduction is gonna be performed.
+ *
+ * See \ref Reduction< Devices::Host > and \ref Reduction< Devices::Cuda >.
+ */
+template< typename Device >
+struct Reduction;
+
+template<>
+struct Reduction< Devices::Sequential >
+{
+   using DeviceType = Devices::Sequential;
+
+   /**
+    * \brief Computes reduction on CPU sequentially.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction
+    *
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static constexpr Result
+   reduce( const Index begin,
+           const Index end,
+           Fetch&& fetch,
+           Reduce&& reduce,
+           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+
+   /**
+    * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
+    *
+    * For example in case of computing minimal or maximal element in array/vector,
+    * the position of the element having given value can be obtained. The use of this method
+    * is, however, more flexible.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
+    *         is the element position and `pair.second` is the reduction result.
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static constexpr std::pair< Result, Index >
+   reduceWithArgument( const Index begin,
+                       const Index end,
+                       Fetch&& fetch,
+                       Reduce&& reduce,
+                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+};
+
+template<>
+struct Reduction< Devices::Host >
+{
+   using DeviceType = Devices::Host;
+
+   /**
+    * \brief Computes reduction on CPU.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction
+    *
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static Result
+   reduce( const Index begin,
+           const Index end,
+           Fetch&& fetch,
+           Reduce&& reduce,
+           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+
+   /**
+    * \brief Computes reduction on CPU and returns position of an element of interest.
+    *
+    * For example in case of computing minimal or maximal element in array/vector,
+    * the position of the element having given value can be obtained. The use of this method
+    * is, however, more flexible.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam ReductionOperation is a lambda function performing the reduction.
+    * \tparam DataFetcher is a lambda function for fetching the input data.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
+    *         is the element position and `pair.second` is the reduction result.
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static std::pair< Result, Index >
+   reduceWithArgument( const Index begin,
+                       const Index end,
+                       Fetch&& fetch,
+                       Reduce&& reduce,
+                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+};
+
+template<>
+struct Reduction< Devices::Cuda >
+{
+   using DeviceType = Devices::Cuda;
+
+   /**
+    * \brief Computes reduction on GPU.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static Result
+   reduce( const Index begin,
+           const Index end,
+           Fetch&& fetch,
+           Reduce&& reduce,
+           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+
+   /**
+    * \brief Computes reduction on GPU and returns position of an element of interest.
+    *
+    * For example in case of computing minimal or maximal element in array/vector,
+    * the position of the element having given value can be obtained. The use of this method
+    * is, however, more flexible.
+    *
+    * \tparam Index is a type for indexing.
+    * \tparam Result is a type of the reduction result.
+    * \tparam Fetch is a lambda function for fetching the input data.
+    * \tparam Reduce is a lambda function performing the reduction.
+    *
+    * \param begin defines range [begin, end) of indexes which will be used for the reduction.
+    * \param end defines range [begin, end) of indexes which will be used for the reduction.
+    * \param fetch is a lambda function fetching the input data.
+    * \param reduce is a lambda function defining the reduction operation and managing the elements positions.
+    * \param zero is the idempotent element for the reduction operation, i.e. element which
+    *             does not change the result of the reduction.
+    * \return result of the reduction in a form of std::pair< Index, Result> structure. `pair.first'
+    *         is the element position and `pair.second` is the reduction result.
+    *
+    */
+   template< typename Index,
+             typename Result,
+             typename Fetch,
+             typename Reduce >
+   static std::pair< Result, Index >
+   reduceWithArgument( const Index begin,
+                       const Index end,
+                       Fetch&& fetch,
+                       Reduce&& reduce,
+                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+};
+
+      } // namespace detail
+   } // namespace Algorithms
+} // namespace TNL
+
+#include <TNL/Algorithms/detail/Reduction.hpp>
\ No newline at end of file
diff --git a/src/TNL/Algorithms/Reduction.hpp b/src/TNL/Algorithms/detail/Reduction.hpp
similarity index 99%
rename from src/TNL/Algorithms/Reduction.hpp
rename to src/TNL/Algorithms/detail/Reduction.hpp
index d7602f9de..0d1c8231f 100644
--- a/src/TNL/Algorithms/Reduction.hpp
+++ b/src/TNL/Algorithms/detail/Reduction.hpp
@@ -26,7 +26,8 @@
 #endif
 
 namespace TNL {
-namespace Algorithms {
+   namespace Algorithms {
+      namespace detail {
 
 /****
  * Arrays smaller than the following constant
@@ -499,5 +500,6 @@ reduceWithArgument( const Index begin,
    }
 }
 
-} // namespace Algorithms
+      } // namespace detail
+   } // namespace Algorithms
 } // namespace TNL
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index 753e737e3..bb4105381 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -389,7 +389,7 @@ reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce,
 
    ValueType* d = this->getData();
    auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( begin, end, main_fetch, reduce, zero );
+   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
 }
 
 template< typename Value,
@@ -407,7 +407,7 @@ reduceElements( IndexType begin, IndexType end, Fetch&& fetch, Reduce&& reduce,
 
    const ValueType* d = this->getData();
    auto main_fetch = [=] __cuda_callable__ ( IndexType i ) mutable -> Result { return fetch( i, d[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( begin, end, main_fetch, reduce, zero );
+   return Algorithms::reduce< DeviceType >( begin, end, main_fetch, reduce, zero );
 }
 
 template< typename Value,
diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h
index 738409cc4..65f299120 100644
--- a/src/TNL/Containers/Expressions/Comparison.h
+++ b/src/TNL/Containers/Expressions/Comparison.h
@@ -68,7 +68,7 @@ struct VectorComparison< T1, T2, false >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] == view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
@@ -100,7 +100,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] > view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool GE( const T1& a, const T2& b )
@@ -115,7 +115,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] >= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LT( const T1& a, const T2& b )
@@ -130,7 +130,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] < view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LE( const T1& a, const T2& b )
@@ -145,7 +145,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable >
       const auto view_a = a.getConstView();
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] <= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
@@ -162,7 +162,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool NE( const T1& a, const T2& b )
@@ -177,7 +177,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool GE( const T1& a, const T2& b )
@@ -187,7 +187,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LT( const T1& a, const T2& b )
@@ -197,7 +197,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LE( const T1& a, const T2& b )
@@ -207,7 +207,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable >
 
       const auto view_b = b.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= view_b[ i ]; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, b.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
@@ -224,7 +224,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] == b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool NE( const T1& a, const T2& b )
@@ -239,7 +239,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] > b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool GE( const T1& a, const T2& b )
@@ -249,7 +249,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] >= b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LT( const T1& a, const T2& b )
@@ -259,7 +259,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] < b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 
    static bool LE( const T1& a, const T2& b )
@@ -269,7 +269,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable >
 
       const auto view_a = a.getConstView();
       auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return view_a[ i ] <= b; };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, a.getSize(), fetch, std::logical_and<>{}, true );
    }
 };
 
diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
index 6959a95fe..e257399f6 100644
--- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h
@@ -1073,7 +1073,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1092,7 +1092,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -1118,7 +1118,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1141,7 +1141,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -1167,7 +1167,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1190,7 +1190,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( lhs.getSize(), fetch, reduction, zero );
 }
 
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
index f1b380435..e1f850013 100644
--- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
+++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h
@@ -70,7 +70,7 @@ auto DistributedExpressionArgMin( const Expression& expression )
          else if( a == b && bIdx < aIdx )
             aIdx = bIdx;
       };
-      result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::max() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( (IndexType) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::max() );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
@@ -129,7 +129,7 @@ auto DistributedExpressionArgMax( const Expression& expression )
          else if( a == b && bIdx < aIdx )
             aIdx = bIdx;
       };
-      result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( ( IndexType ) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::lowest() );
+      result = Algorithms::reduceWithArgument< Devices::Host >( ( IndexType ) 0, (IndexType) nproc, fetch, reduction, std::numeric_limits< RealType >::lowest() );
       result.second = gatheredResults[ result.second ].second;
    }
    return result;
diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h b/src/TNL/Containers/Expressions/ExpressionTemplates.h
index 93d7e802d..11b06e822 100644
--- a/src/TNL/Containers/Expressions/ExpressionTemplates.h
+++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h
@@ -896,7 +896,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -915,7 +915,7 @@ Result evaluateAndReduce( Vector& lhs,
 
    RealType* lhs_data = lhs.getData();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -941,7 +941,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -964,7 +964,7 @@ Result addAndReduce( Vector& lhs,
       lhs_data[ i ] += aux;
       return aux;
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 ////
@@ -990,7 +990,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 template< typename Vector,
@@ -1013,7 +1013,7 @@ Result addAndReduceAbs( Vector& lhs,
       lhs_data[ i ] += aux;
       return TNL::abs( aux );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, lhs.getSize(), fetch, reduction, zero );
 }
 
 } // namespace TNL
diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h
index 6e5f5624b..ff094e4ea 100644
--- a/src/TNL/Containers/Expressions/VerticalOperations.h
+++ b/src/TNL/Containers/Expressions/VerticalOperations.h
@@ -43,7 +43,7 @@ auto ExpressionMin( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -65,7 +65,7 @@ auto ExpressionArgMin( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -85,7 +85,7 @@ auto ExpressionMax( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Expression >
@@ -107,7 +107,7 @@ auto ExpressionArgMax( const Expression& expression )
    };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
+   return Algorithms::reduceWithArgument< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, reduction, std::numeric_limits< ResultType >::lowest() );
 }
 
 template< typename Expression >
@@ -119,7 +119,7 @@ auto ExpressionSum( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::plus<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::plus<>{}, (ResultType) 0 );
 }
 
 template< typename Expression >
@@ -131,7 +131,7 @@ auto ExpressionProduct( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::multiplies<>{}, (ResultType) 1 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::multiplies<>{}, (ResultType) 1 );
 }
 
 template< typename Expression >
@@ -145,7 +145,7 @@ auto ExpressionLogicalAnd( const Expression& expression )
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::logical_and<>{}, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_and<>{}, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -157,7 +157,7 @@ auto ExpressionLogicalOr( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::logical_or<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::logical_or<>{}, (ResultType) 0 );
 }
 
 template< typename Expression >
@@ -171,7 +171,7 @@ auto ExpressionBinaryAnd( const Expression& expression )
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
    static_assert( std::numeric_limits< ResultType >::is_specialized,
                   "std::numeric_limits is not specialized for the reduction's result type" );
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::bit_and<>{}, std::numeric_limits< ResultType >::max() );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_and<>{}, std::numeric_limits< ResultType >::max() );
 }
 
 template< typename Expression >
@@ -183,7 +183,7 @@ auto ExpressionBinaryOr( const Expression& expression )
 
    const auto view = expression.getConstView();
    auto fetch = [=] __cuda_callable__ ( IndexType i ) { return view[ i ]; };
-   return Algorithms::Reduction< typename Expression::DeviceType >::reduce( ( IndexType ) 0, expression.getSize(), fetch, std::bit_or<>{}, (ResultType) 0 );
+   return Algorithms::reduce< typename Expression::DeviceType >( ( IndexType ) 0, expression.getSize(), fetch, std::bit_or<>{}, (ResultType) 0 );
 }
 
 } // namespace Expressions
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index e5113e90a..b7a3448b8 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -8,6 +8,8 @@
 
 /* See Copyright Notice in tnl/Copyright */
 
+#pragma once
+
 #include <limits>
 
 namespace TNL {
diff --git a/src/TNL/Matrices/DenseMatrixView.hpp b/src/TNL/Matrices/DenseMatrixView.hpp
index 0bf262aa0..4a999d76b 100644
--- a/src/TNL/Matrices/DenseMatrixView.hpp
+++ b/src/TNL/Matrices/DenseMatrixView.hpp
@@ -168,7 +168,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/Matrix.hpp b/src/TNL/Matrices/Matrix.hpp
index d057a3ecf..7e9a3722b 100644
--- a/src/TNL/Matrices/Matrix.hpp
+++ b/src/TNL/Matrices/Matrix.hpp
@@ -85,7 +85,7 @@ Index Matrix< Real, Device, Index, RealAllocator >::getNonzeroElementsCount() co
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/MatrixView.hpp b/src/TNL/Matrices/MatrixView.hpp
index 83563a825..287305b5c 100644
--- a/src/TNL/Matrices/MatrixView.hpp
+++ b/src/TNL/Matrices/MatrixView.hpp
@@ -63,7 +63,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/MultidiagonalMatrixView.hpp b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
index 7dadde222..2b83fc87b 100644
--- a/src/TNL/Matrices/MultidiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/MultidiagonalMatrixView.hpp
@@ -173,7 +173,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/SparseMatrixView.hpp b/src/TNL/Matrices/SparseMatrixView.hpp
index 593c100a9..02ef757c2 100644
--- a/src/TNL/Matrices/SparseMatrixView.hpp
+++ b/src/TNL/Matrices/SparseMatrixView.hpp
@@ -191,7 +191,7 @@ getNonzeroElementsCount() const
       auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
          return ( columns_view[ i ] != paddingIndex );
       };
-      return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->columnIndexes.getSize(), fetch, std::plus<>{}, 0 );
+      return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->columnIndexes.getSize(), fetch, std::plus<>{}, 0 );
    }
    else
    {
@@ -869,7 +869,7 @@ operator==( const Matrix& m ) const
    {
       return view1.getRow( i ) == view2.getRow( i );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( 0, this->getRows(), fetch, std::logical_and<>{}, true );
+   return Algorithms::reduce< DeviceType >( 0, this->getRows(), fetch, std::logical_and<>{}, true );
 }
 
 template< typename Real,
diff --git a/src/TNL/Matrices/TridiagonalMatrixView.hpp b/src/TNL/Matrices/TridiagonalMatrixView.hpp
index 3aa633776..5e7bfe756 100644
--- a/src/TNL/Matrices/TridiagonalMatrixView.hpp
+++ b/src/TNL/Matrices/TridiagonalMatrixView.hpp
@@ -133,7 +133,7 @@ getNonzeroElementsCount() const
    auto fetch = [=] __cuda_callable__ ( const IndexType i ) -> IndexType {
       return ( values_view[ i ] != 0.0 );
    };
-   return Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
+   return Algorithms::reduce< DeviceType >( ( IndexType ) 0, this->values.getSize(), fetch, std::plus<>{}, 0 );
 }
 
 template< typename Real,
diff --git a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
index 0fc2bebaf..b722535a5 100644
--- a/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
+++ b/src/TNL/Meshes/MeshDetails/layers/EntityTags/Layer.h
@@ -137,8 +137,8 @@ public:
       {
          return bool(tags_view[ entityIndex ] & EntityTags::GhostEntity);
       };
-      const GlobalIndexType boundaryEntities = Algorithms::Reduction< Device >::reduce( (GlobalIndexType) 0, tags.getSize(), is_boundary, std::plus<>{}, (GlobalIndexType) 0 );
-      const GlobalIndexType ghostEntities = Algorithms::Reduction< Device >::reduce( (GlobalIndexType) 0, tags.getSize(), is_ghost, std::plus<>{}, (GlobalIndexType) 0 );
+      const GlobalIndexType boundaryEntities = Algorithms::reduce< Device >( (GlobalIndexType) 0, tags.getSize(), is_boundary, std::plus<>{}, (GlobalIndexType) 0 );
+      const GlobalIndexType ghostEntities = Algorithms::reduce< Device >( (GlobalIndexType) 0, tags.getSize(), is_ghost, std::plus<>{}, (GlobalIndexType) 0 );
 
       interiorIndices.setSize( tags.getSize() - boundaryEntities );
       boundaryIndices.setSize( boundaryEntities );
diff --git a/src/UnitTests/Matrices/DenseMatrixTest.h b/src/UnitTests/Matrices/DenseMatrixTest.h
index c6dfa3842..ef7d077a5 100644
--- a/src/UnitTests/Matrices/DenseMatrixTest.h
+++ b/src/UnitTests/Matrices/DenseMatrixTest.h
@@ -446,7 +446,7 @@ void test_SetElement()
    auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool {
       return ( v_view[ i ] == m_view.getElement( i, i ) );
    };
-   EXPECT_TRUE( TNL::Algorithms::Reduction< DeviceType >::reduce( ( IndexType ) 0, m.getRows(), fetch, std::logical_and<>{}, true ) );
+   EXPECT_TRUE( TNL::Algorithms::reduce< DeviceType >( ( IndexType ) 0, m.getRows(), fetch, std::logical_and<>{}, true ) );
 
 }
 
-- 
GitLab


From b48ece72c24d2ce83af3e037ea0d817eaa4a063d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 21:45:00 +0200
Subject: [PATCH 11/34] Fixing functionals to wirk with CUDA.

---
 src/TNL/Functional.h | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index b7a3448b8..bb5b78a54 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -26,7 +26,7 @@ struct Plus
 
    static constexpr Value getIdempotent() { return ( Value ) 0; };
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs + rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs + rhs; }
 };
 
 /**
@@ -41,7 +41,7 @@ struct Plus< void >
    static constexpr T getIdempotent() { return ( T ) 0; };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs + rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs + rhs; }
 };
 
 /**
@@ -56,7 +56,7 @@ struct Multiplies
 
    static constexpr ValueType idempotent = 1;
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs * rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs * rhs; }
 };
 
 /**
@@ -71,7 +71,7 @@ struct Multiplies< void >
    static constexpr T getIdempotent() { return ( T ) 1; };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs * rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs * rhs; }
 };
 
 /**
@@ -86,7 +86,7 @@ struct Min
 
    static constexpr ValueType idempotent = std::numeric_limits< Value >::max();
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs < rhs ? lhs : rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs < rhs ? lhs : rhs; }
 };
 
 /**
@@ -101,7 +101,7 @@ struct Min< void >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs < rhs ? lhs : rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs < rhs ? lhs : rhs; }
 };
 
 
@@ -117,7 +117,7 @@ struct Max
 
    static constexpr ValueType idempotent = std::numeric_limits< Value >::min();
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs > rhs ? lhs : rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs > rhs ? lhs : rhs; }
 };
 
 /**
@@ -132,7 +132,7 @@ struct Max< void >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs > rhs ? lhs : rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs > rhs ? lhs : rhs; }
 };
 
 /**
@@ -147,7 +147,7 @@ struct MinWithArg
 
    static constexpr ValueType idempotent = std::numeric_limits< Value >::max();
 
-   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
    {
       if( lhs > rhs )
       {
@@ -173,7 +173,7 @@ struct MinWithArg< void, void >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
 
    template< typename Value, typename Index >
-   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
    {
       if( lhs > rhs )
       {
@@ -199,7 +199,7 @@ struct MaxWithArg
 
    static constexpr ValueType idempotent = std::numeric_limits< Value >::min();
 
-   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
    {
       if( lhs < rhs )
       {
@@ -225,7 +225,7 @@ struct MaxWithArg< void, void >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
 
    template< typename Value, typename Index >
-   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx )
+   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
    {
       if( lhs < rhs )
       {
@@ -251,7 +251,7 @@ struct LogicalAnd
 
    static constexpr ValueType idempotent = ( Value ) true;
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs && rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs && rhs; }
 };
 
 /**
@@ -266,7 +266,7 @@ struct LogicalAnd< void >
    static constexpr T getIdempotent() { return ( T ) true; };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs && rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs && rhs; }
 };
 
 /**
@@ -281,7 +281,7 @@ struct LogicalOr
 
    static constexpr ValueType idempotent = ( Value ) false;
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs || rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs || rhs; }
 };
 
 /**
@@ -296,7 +296,7 @@ struct LogicalOr< void >
    static constexpr T getIdempotent() { return ( T ) false; };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs || rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs || rhs; }
 };
 
 
@@ -312,7 +312,7 @@ struct BitAnd
 
    static constexpr ValueType idempotent = ~static_cast< ValueType >( 0 );
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs & rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs & rhs; }
 };
 
 /**
@@ -327,7 +327,7 @@ struct BitAnd< void >
    static constexpr T getIdempotent() { return ~static_cast< T >( 0 ); };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs & rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs & rhs; }
 };
 
 /**
@@ -342,7 +342,7 @@ struct BitOr
 
    static constexpr ValueType idempotent =  static_cast< ValueType >( 0 );
 
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) { return lhs | rhs; }
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs | rhs; }
 };
 
 /**
@@ -357,7 +357,7 @@ struct BitOr< void >
    static constexpr T getIdempotent() { return static_cast< T >( 0 ); };
 
    template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) { return lhs | rhs; }
+   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs | rhs; }
 };
 
 } // namespace TNL
-- 
GitLab


From bd49c7e3b1af4720d3cb7f76fd95b49de2a25fd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Mon, 5 Jul 2021 21:45:44 +0200
Subject: [PATCH 12/34] Fixing Reduction and unit tests to work with CUDA.

---
 src/TNL/Algorithms/detail/Reduction.h    |  14 +--
 src/UnitTests/Algorithms/ReductionTest.h | 143 ++++++++++++++++-------
 2 files changed, 105 insertions(+), 52 deletions(-)

diff --git a/src/TNL/Algorithms/detail/Reduction.h b/src/TNL/Algorithms/detail/Reduction.h
index 6ebef402d..998fa7ff5 100644
--- a/src/TNL/Algorithms/detail/Reduction.h
+++ b/src/TNL/Algorithms/detail/Reduction.h
@@ -70,7 +70,7 @@ struct Reduction< Devices::Sequential >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+           const Result& zero );
 
    /**
     * \brief Computes sequentially reduction on CPU and returns position of an element of interest.
@@ -102,7 +102,7 @@ struct Reduction< Devices::Sequential >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+                       const Result& zero );
 };
 
 template<>
@@ -136,7 +136,7 @@ struct Reduction< Devices::Host >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+           const Result& zero );
 
    /**
     * \brief Computes reduction on CPU and returns position of an element of interest.
@@ -168,7 +168,7 @@ struct Reduction< Devices::Host >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+                       const Result& zero );
 };
 
 template<>
@@ -201,7 +201,7 @@ struct Reduction< Devices::Cuda >
            const Index end,
            Fetch&& fetch,
            Reduce&& reduce,
-           const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+           const Result& zero );
 
    /**
     * \brief Computes reduction on GPU and returns position of an element of interest.
@@ -234,11 +234,11 @@ struct Reduction< Devices::Cuda >
                        const Index end,
                        Fetch&& fetch,
                        Reduce&& reduce,
-                       const Result& zero = Reduce::template getIdempotent< DeviceType >() );
+                       const Result& zero );
 };
 
       } // namespace detail
    } // namespace Algorithms
 } // namespace TNL
 
-#include <TNL/Algorithms/detail/Reduction.hpp>
\ No newline at end of file
+#include <TNL/Algorithms/detail/Reduction.hpp>
diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
index e2d573e50..c198f4e3e 100644
--- a/src/UnitTests/Algorithms/ReductionTest.h
+++ b/src/UnitTests/Algorithms/ReductionTest.h
@@ -23,27 +23,10 @@ using namespace TNL;
 
 #ifdef HAVE_GTEST
 
-// test fixture for typed tests
 template< typename Device >
-class ReduceTest : public ::testing::Test
+void ReduceTest_sum()
 {
-protected:
-   using DeviceType = Device;
-};
-
-// types for which ArrayTest is instantiated
-using DeviceTypes = ::testing::Types<
-   Devices::Host
-#ifdef HAVE_CUDA
-   ,Devices::Cuda
-#endif
-   >;
-
-TYPED_TEST_SUITE( ReduceTest, DeviceTypes );
-
-TYPED_TEST( ReduceTest, sum )
-{
-   using Array = Containers::Array< int, Devices::Host >;
+   using Array = Containers::Array< int, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -52,14 +35,15 @@ TYPED_TEST( ReduceTest, sum )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::Plus<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Plus<>{} );
       EXPECT_EQ( res, size );
    }
 }
 
-TYPED_TEST( ReduceTest, min )
+template< typename Device >
+void ReduceTest_min()
 {
-   using Array = Containers::Array< int, Devices::Host >;
+   using Array = Containers::Array< int, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -68,14 +52,15 @@ TYPED_TEST( ReduceTest, min )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::Min<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Min<>{} );
       EXPECT_EQ( res, 1 );
    }
 }
 
-TYPED_TEST( ReduceTest, max )
+template< typename Device >
+void ReduceTest_max()
 {
-   using Array = Containers::Array< int, Devices::Host >;
+   using Array = Containers::Array< int, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -84,14 +69,15 @@ TYPED_TEST( ReduceTest, max )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::Max<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Max<>{} );
       EXPECT_EQ( res, size );
    }
 }
 
-TYPED_TEST( ReduceTest, minWithArg )
+template< typename Device >
+void ReduceTest_minWithArg()
 {
-   using Array = Containers::Array< int, Devices::Host >;
+   using Array = Containers::Array< int, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -100,15 +86,16 @@ TYPED_TEST( ReduceTest, minWithArg )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Devices::Host >( ( int ) 0, size, fetch, TNL::MinWithArg<>{} );
+      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MinWithArg<>{} );
       EXPECT_EQ( res.first, 1 );
       EXPECT_EQ( res.second, 0 );
    }
 }
 
-TYPED_TEST( ReduceTest, maxWithArg )
+template< typename Device >
+void ReduceTest_maxWithArg()
 {
-   using Array = Containers::Array< int, Devices::Host >;
+   using Array = Containers::Array< int, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -117,16 +104,16 @@ TYPED_TEST( ReduceTest, maxWithArg )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Devices::Host >( ( int ) 0, size, fetch, TNL::MaxWithArg<>{} );
+      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MaxWithArg<>{} );
       EXPECT_EQ( res.first, size );
       EXPECT_EQ( res.second, size - 1 );
    }
 }
 
-
-TYPED_TEST( ReduceTest, logicalAnd )
+template< typename Device >
+void ReduceTest_logicalAnd()
 {
-   using Array = Containers::Array< bool, Devices::Host >;
+   using Array = Containers::Array< bool, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -135,14 +122,15 @@ TYPED_TEST( ReduceTest, logicalAnd )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::LogicalAnd<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalAnd<>{} );
       EXPECT_EQ( res, false );
    }
 }
 
-TYPED_TEST( ReduceTest, logicalOr )
+template< typename Device >
+void ReduceTest_logicalOr()
 {
-   using Array = Containers::Array< bool, Devices::Host >;
+   using Array = Containers::Array< bool, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -151,14 +139,15 @@ TYPED_TEST( ReduceTest, logicalOr )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::LogicalOr<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalOr<>{} );
       EXPECT_EQ( res, true );
    }
 }
 
-TYPED_TEST( ReduceTest, bitAnd )
+template< typename Device >
+void ReduceTest_bitAnd()
 {
-   using Array = Containers::Array< char, Devices::Host >;
+   using Array = Containers::Array< char, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -167,14 +156,15 @@ TYPED_TEST( ReduceTest, bitAnd )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::BitAnd<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitAnd<>{} );
       EXPECT_EQ( res, 1 );
    }
 }
 
-TYPED_TEST( ReduceTest, bitOr )
+template< typename Device >
+void ReduceTest_bitOr()
 {
-   using Array = Containers::Array< char, Devices::Host >;
+   using Array = Containers::Array< char, Device >;
    Array a;
    for( int size = 100; size <= 1000000; size *= 10 )
    {
@@ -183,11 +173,74 @@ TYPED_TEST( ReduceTest, bitOr )
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Devices::Host >( ( int ) 0, size, fetch, TNL::BitOr<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitOr<>{} );
       EXPECT_EQ( res, ( char ) 255 );
    }
 }
 
+// test fixture for typed tests
+template< typename Device >
+class ReduceTest : public ::testing::Test
+{
+protected:
+   using DeviceType = Device;
+};
+
+// types for which ArrayTest is instantiated
+using DeviceTypes = ::testing::Types<
+   Devices::Host
+#ifdef HAVE_CUDA
+   ,Devices::Cuda
+#endif
+   >;
+
+TYPED_TEST_SUITE( ReduceTest, DeviceTypes );
+
+TYPED_TEST( ReduceTest, sum )
+{
+   ReduceTest_sum< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, min )
+{
+   ReduceTest_min< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, max )
+{
+   ReduceTest_max< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, minWithArg )
+{
+   ReduceTest_minWithArg< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, maxWithArg )
+{
+   ReduceTest_maxWithArg< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, logicalAnd )
+{
+   ReduceTest_logicalAnd< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, logicalOr )
+{
+   ReduceTest_logicalOr< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, bitAnd )
+{
+   ReduceTest_bitAnd< typename TestFixture::DeviceType >();
+}
+
+TYPED_TEST( ReduceTest, bitOr )
+{
+   ReduceTest_bitOr< typename TestFixture::DeviceType >();
+}
+
 #endif
 
 #include "../main.h"
-- 
GitLab


From 2217f745600260c02ba496dd34ff0b7cd03083ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Jul 2021 11:16:28 +0200
Subject: [PATCH 13/34] Fixing documentation of Functionals.h.

---
 src/TNL/Functional.h | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index bb5b78a54..3fb394dc7 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -15,7 +15,7 @@
 namespace TNL {
 
 /**
- * \brief Replacement of std::plus which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::plus optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -30,7 +30,7 @@ struct Plus
 };
 
 /**
- * \brief Replacement of std::plus which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::plus optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -45,7 +45,7 @@ struct Plus< void >
 };
 
 /**
- * \brief Replacement of std::multiplies which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::multiplies optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -60,7 +60,7 @@ struct Multiplies
 };
 
 /**
- * \brief Replacement of std::multiplies which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::multiplies optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -75,7 +75,7 @@ struct Multiplies< void >
 };
 
 /**
- * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -90,7 +90,7 @@ struct Min
 };
 
 /**
- * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -106,7 +106,7 @@ struct Min< void >
 
 
 /**
- * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -121,7 +121,7 @@ struct Max
 };
 
 /**
- * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -136,7 +136,7 @@ struct Max< void >
 };
 
 /**
- * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduceWithArgument.
  *
  * \tparam Value is data type.
  */
@@ -162,7 +162,7 @@ struct MinWithArg
 };
 
 /**
- * \brief Replacement of std::min which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduceWithArgument.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -188,7 +188,7 @@ struct MinWithArg< void, void >
 };
 
 /**
- * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduceWithArgument.
  *
  * \tparam Value is data type.
  */
@@ -214,7 +214,7 @@ struct MaxWithArg
 };
 
 /**
- * \brief Replacement of std::max which is optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduceWithArgument.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -240,7 +240,7 @@ struct MaxWithArg< void, void >
 };
 
 /**
- * \brief Replacement of std::logical_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::logical_and optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -255,7 +255,7 @@ struct LogicalAnd
 };
 
 /**
- * \brief Replacement of std::logical_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::logical_and optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -270,7 +270,7 @@ struct LogicalAnd< void >
 };
 
 /**
- * \brief Replacement of std::logical_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::logical_or optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -285,7 +285,7 @@ struct LogicalOr
 };
 
 /**
- * \brief Replacement of std::logical_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::logical_or optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -301,7 +301,7 @@ struct LogicalOr< void >
 
 
 /**
- * \brief Replacement of std::bit_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::bit_and optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -316,7 +316,7 @@ struct BitAnd
 };
 
 /**
- * \brief Replacement of std::bit_and which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::bit_and optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
@@ -331,7 +331,7 @@ struct BitAnd< void >
 };
 
 /**
- * \brief Replacement of std::bit_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::bit_or optimized for use with \ref TNL::Algorithms::reduce.
  *
  * \tparam Value is data type.
  */
@@ -346,7 +346,7 @@ struct BitOr
 };
 
 /**
- * \brief Replacement of std::bit_or which is optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Replacement of std::bit_or optimized for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-- 
GitLab


From 30139b7c26d4cc9bd3ed5e90097205d2114ae55a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Jul 2021 11:17:07 +0200
Subject: [PATCH 14/34] Adding functionals to parallel reduction tutorial.

---
 .../Tutorials/ReductionAndScan/CMakeLists.txt |  1 +
 .../ReductionAndScan/ComparisonExample.cpp    |  4 +-
 .../ReductionAndScan/MapReduceExample-2.cpp   |  4 +-
 .../ReductionAndScan/MapReduceExample-3.cpp   |  4 +-
 .../ReductionAndScan/ProductExample.cpp       |  4 +-
 .../ScalarProductWithFunctionalExample.cpp    | 52 +++++++++++++
 .../ScalarProductWithFunctionalExample.cu     |  1 +
 .../ReductionAndScan/ScanExample.cpp          |  7 +-
 .../SequentialSumWithLambdas.cpp              |  4 +-
 .../Tutorials/ReductionAndScan/SumExample.cpp |  4 +-
 .../tutorial_ReductionAndScan.md              | 78 ++++++++++++-------
 11 files changed, 121 insertions(+), 42 deletions(-)
 create mode 100644 Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
 create mode 120000 Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cu

diff --git a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
index 547a55574..594ebd8cd 100644
--- a/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
+++ b/Documentation/Tutorials/ReductionAndScan/CMakeLists.txt
@@ -3,6 +3,7 @@ set( COMMON_EXAMPLES
      SumExampleWithFunctional
      ProductExample
      ScalarProductExample
+     ScalarProductWithFunctionalExample
      MaximumNormExample
      ComparisonExample
      UpdateAndResidueExample
diff --git a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
index 46d6e50a2..8972af7f4 100644
--- a/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ComparisonExample.cpp
@@ -21,8 +21,8 @@ bool comparison( const Vector< double, Device >& u, const Vector< double, Device
    /***
     * Reduce performs logical AND on intermediate results obtained by fetch.
     */
-   auto reduce_ = [] __cuda_callable__ ( const bool& a, const bool& b ) { return a && b; };
-   return reduce< Device >( 0, v_view.getSize(), fetch, reduce_, true );
+   auto reduction = [] __cuda_callable__ ( const bool& a, const bool& b ) { return a && b; };
+   return reduce< Device >( 0, v_view.getSize(), fetch, reduction, true );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
index da873ca6f..065f4608a 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-2.cpp
@@ -27,7 +27,7 @@ int main( int argc, char* argv[] )
    timer.start();
    double result = mapReduce( host_u );
    timer.stop();
-   std::cout << "Host tesult is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "Host tesult is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #ifdef HAVE_CUDA
    Vector< double, Devices::Cuda > cuda_u( 100000 );
    cuda_u = 1.0;
@@ -35,7 +35,7 @@ int main( int argc, char* argv[] )
    timer.start();
    result = mapReduce( cuda_u );
    timer.stop();
-   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #endif
    return EXIT_SUCCESS;
 }
diff --git a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
index cdd677f87..f3c54f6b0 100644
--- a/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/MapReduceExample-3.cpp
@@ -26,7 +26,7 @@ int main( int argc, char* argv[] )
    timer.start();
    double result = mapReduce( host_u );
    timer.stop();
-   std::cout << "Host result is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "Host result is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #ifdef HAVE_CUDA
    Vector< double, Devices::Cuda > cuda_u( 100000 );
    cuda_u = 1.0;
@@ -34,7 +34,7 @@ int main( int argc, char* argv[] )
    timer.start();
    result = mapReduce( cuda_u );
    timer.stop();
-   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << "seconds." << std::endl;
+   std::cout << "CUDA result is:" << result << ". It took " << timer.getRealTime() << " seconds." << std::endl;
 #endif
    return EXIT_SUCCESS;
 }
diff --git a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
index e4cd58b5e..389ecd497 100644
--- a/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ProductExample.cpp
@@ -12,12 +12,12 @@ double product( const Vector< double, Device >& v )
 {
    auto view = v.getConstView();
    auto fetch = [=] __cuda_callable__ ( int i ) { return view[ i ]; };
-   auto reduce_ = [] __cuda_callable__ ( const double& a, const double& b ) { return a * b; };
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a * b; };
 
    /***
     * Since we compute the product of all elements, the reduction must be initialized by 1.0 not by 0.0.
     */
-   return reduce< Device >( 0, view.getSize(), fetch, reduce_, 1.0 );
+   return reduce< Device >( 0, view.getSize(), fetch, reduction, 1.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
new file mode 100644
index 000000000..74d4ba713
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <cstdlib>
+#include <TNL/Containers/Vector.h>
+#include <TNL/Algorithms/Reduction.h>
+
+using namespace TNL;
+using namespace TNL::Containers;
+using namespace TNL::Algorithms;
+
+template< typename Device >
+double scalarProduct( const Vector< double, Device >& u, const Vector< double, Device >& v )
+{
+   auto u_view = u.getConstView();
+   auto v_view = v.getConstView();
+
+   /***
+    * Fetch computes product of corresponding elements of both vectors.
+    */
+   return reduce< Device >(
+      0, v_view.getSize(),
+      [=] __cuda_callable__ ( int i ) { return u_view[ i ] * v_view[ i ]; },
+      TNL::Plus<>{} );
+}
+
+int main( int argc, char* argv[] )
+{
+   /***
+    * The first test on CPU ...
+    */
+   Vector< double, Devices::Host > host_u( 10 ), host_v( 10 );
+   host_u = 1.0;
+   host_v.forAllElements( [] __cuda_callable__ ( int i, double& value ) { value = 2 * ( i % 2 ) - 1; } );
+   std::cout << "host_u = " << host_u << std::endl;
+   std::cout << "host_v = " << host_v << std::endl;
+   std::cout << "The scalar product ( host_u, host_v ) is " << scalarProduct( host_u, host_v ) << "." << std::endl;
+   std::cout << "The scalar product ( host_v, host_v ) is " << scalarProduct( host_v, host_v ) << "." << std::endl;
+
+   /***
+    * ... the second test on GPU.
+    */
+#ifdef HAVE_CUDA
+   Vector< double, Devices::Cuda > cuda_u( 10 ), cuda_v( 10 );
+   cuda_u = 1.0;
+   cuda_v.forAllElements( [] __cuda_callable__ ( int i, double& value ) { value = 2 * ( i % 2 ) - 1; } );
+   std::cout << "cuda_u = " << cuda_u << std::endl;
+   std::cout << "cuda_v = " << cuda_v << std::endl;
+   std::cout << "The scalar product ( cuda_u, cuda_v ) is " << scalarProduct( cuda_u, cuda_v ) << "." << std::endl;
+   std::cout << "The scalar product ( cuda_v, cuda_v ) is " << scalarProduct( cuda_v, cuda_v ) << "." << std::endl;
+#endif
+   return EXIT_SUCCESS;
+}
+
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cu b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cu
new file mode 120000
index 000000000..8eef06256
--- /dev/null
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cu
@@ -0,0 +1 @@
+ScalarProductWithFunctionalExample.cpp
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
index 3dbd8581d..5281bfd54 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScanExample.cpp
@@ -12,14 +12,14 @@ void scan( Vector< double, Device >& v )
    /***
     * Reduction is sum of two numbers.
     */
-   auto reduce = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
 
    /***
     * As parameters, we pass vector on which the scan is to be performed, interval
     * where the scan is performed, lambda function which is used by the scan and
     * zero element (idempotent) of the 'sum' operation.
     */
-   Scan< Device >::perform( v, 0, v.getSize(), reduce, 0.0 );
+   Scan< Device >::perform( v, 0, v.getSize(), reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
@@ -44,5 +44,4 @@ int main( int argc, char* argv[] )
    std::cout << "The prefix sum of the CUDA vector is " << cuda_v << "." << std::endl;
 #endif
    return EXIT_SUCCESS;
-}
-
+}
\ No newline at end of file
diff --git a/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp b/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp
index 0932b8b18..377040c76 100644
--- a/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SequentialSumWithLambdas.cpp
@@ -1,11 +1,11 @@
 double sequentialSum( const double* a, const int size )
 {
    auto fetch = [=] (int i)->double { return a[ i ]; };
-   auto reduce = [] (double& x, const double& y) { return x + y; };
+   auto reduction = [] (double& x, const double& y) { return x + y; };
 
    double sum( 0.0 );
    for( int i = 0; i < size; i++ )
-       sum = reduce( sum, fetch( i ) );
+       sum = reduction( sum, fetch( i ) );
    return sum;
 }
 
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
index 8a60888b6..cfa6e1bef 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExample.cpp
@@ -23,14 +23,14 @@ double sum( const Vector< double, Device >& v )
    /***
     * Reduction is sum of two numbers.
     */
-   auto reduce_ = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
+   auto reduction = [] __cuda_callable__ ( const double& a, const double& b ) { return a + b; };
 
    /***
     * Finally we call the templated function Reduction and pass number of elements to reduce,
     * lambdas defined above and finally value of idempotent element, zero in this case, which serve for the
     * reduction initiation.
     */
-   return reduce< Device >( 0, view.getSize(), fetch, reduce_, 0.0 );
+   return reduce< Device >( 0, view.getSize(), fetch, reduction, 0.0 );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index 92e3ea620..ca729aaba 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -14,11 +14,11 @@ We will explain the *flexible parallel reduction* on several examples. We start
 
 We start with simple problem of computing sum of sequence of numbers \f[ s = \sum_{i=1}^n a_i. \f] Sequentialy, such sum can be computed very easily as follows:
 
-\include SequentialSum.cpp
+\includelineno SequentialSum.cpp
 
 Doing the same in CUDA for GPU is, however, much more difficult (see. [Optimizing Parallel Reduction in CUDA](https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf)). The final code has tens of lines and it is something you do not want to write again and again anytime you need to sum a series of numbers. Using TNL and C++ lambda functions we may do the same on few lines of code efficiently and independently on the hardware beneath. Let us first rewrite the previous example using the C++ lambda functions:
 
-\include SequentialSumWithLambdas.cpp
+\includelineno SequentialSumWithLambdas.cpp
 
 As can be seen, we split the reduction into two steps:
 
@@ -26,15 +26,15 @@ As can be seen, we split the reduction into two steps:
    1. Connect the reduction algorithm with given input arrays or vectors (or any other data structure).
    2. Perform operation you need to do with the input data.
    3. Perform another secondary operation simoultanously with the parallel reduction.
-2. `reduce` is operation we want to do after the data fetch. Usually it is summation, multiplication, evaluation of minimum or maximum or some logical operation.
+2. `reduction` is operation we want to do after the data fetch. Usually it is summation, multiplication, evaluation of minimum or maximum or some logical operation.
 
 Putting everything together gives the following example:
 
-\include SumExample.cpp
+\includelineno SumExample.cpp
 
 Since TNL vectors cannot be pass to CUDA kernels and so they cannot be captured by CUDA lambdas, we must first get vector view from the vector using a method `getConstView()`.
 
-Note tha we pass `0.0` as the last argument of the method `reduce< Device >`. It is an *idempotent element* (see [Idempotence](https://cs.wikipedia.org/wiki/Idempotence)). It is an element which, for given operation, does not change the result. For addition, it is zero. The result looks as follows.
+Note tha we pass `0.0` as the last argument of the template function `reduce< Device >`. It is an *idempotent element* (see [Idempotence](https://cs.wikipedia.org/wiki/Idempotence)). It is an element which, for given operation, does not change the result. For addition, it is zero. The result looks as follows.
 
 \include SumExample.out
 
@@ -42,9 +42,9 @@ Sum of vector elements can be also obtained as [`sum(v)`](../html/namespaceTNL.h
 
 ### Product
 
-To demonstrate the effect of the *idempotent element*, we will now compute product of all elements of the vector. The *idempotent element* is one for multiplication and we also need to replace `a+b` with `a*b` in the definition of `reduce`. We get the following code:
+To demonstrate the effect of the *idempotent element*, we will now compute product of all elements of the vector. The *idempotent element* is one for multiplication and we also need to replace `a+b` with `a*b` in the definition of `reduction`. We get the following code:
 
-\include ProductExample.cpp
+\includelineno ProductExample.cpp
 
 leading to output like this:
 
@@ -56,7 +56,7 @@ Product of vector elements can be computed using fuction [`product(v)`](../html/
 
 One of the most important operation in the linear algebra is the scalar product of two vectors. Compared to coputing the sum of vector elements we must change the function `fetch` to read elements from both vectors and multiply them. See the following example.
 
-\include ScalarProductExample.cpp
+\includelineno ScalarProductExample.cpp
 
 The result is:
 
@@ -64,11 +64,11 @@ The result is:
 
 Scalar product of vectors `u` and `v` in TNL can be computed by \ref TNL::dot "TNL::dot(u, v)" or simply as \ref TNL::Containers::operator, "(u, v)".
 
-### Maxium norm
+### Maximum norm
 
-Maximum norm of a vector equals modulus of the vector largest element.  Therefore, `fetch` must return the absolute value of the vector elements and `reduce` wil return maximum of given values. Look at the following example.
+Maximum norm of a vector equals modulus of the vector largest element.  Therefore, `fetch` must return the absolute value of the vector elements and `reduction` wil return maximum of given values. Look at the following example.
 
-\include MaximumNormExample.cpp
+\includelineno MaximumNormExample.cpp
 
 The output is:
 
@@ -78,9 +78,9 @@ Maximum norm in TNL is computed by the function \ref TNL::maxNorm.
 
 ### Vectors comparison
 
-Comparison of two vectors involve (parallel) reduction as well. The `fetch` part is responsible for comparison of corresponding vector elements result of which is boolean `true` or `false` for each vector elements. The `reduce` part must perform logical and operation on all of them. We must not forget to change the *idempotent element* to `true`. The code may look as follows:
+Comparison of two vectors involve (parallel) reduction as well. The `fetch` part is responsible for comparison of corresponding vector elements result of which is boolean `true` or `false` for each vector elements. The `reduction` part must perform logical and operation on all of them. We must not forget to change the *idempotent element* to `true`. The code may look as follows:
 
-\include ComparisonExample.cpp
+\includelineno ComparisonExample.cpp
 
 And the output looks as:
 
@@ -96,7 +96,7 @@ In iterative solvers we often need to update a vector and compute the update nor
 
 Together with the vector addition, we may want to compute also \f$L_2\f$-norm of \f$\Delta \bf u\f$ which may indicate convergence. Computing first the addition and then the norm would be inefficient because we would have to fetch the vector \f$\Delta \bf u\f$ twice from the memory. The following example shows how to do the addition and norm computation at the same time.
 
-\include UpdateAndResidueExample.cpp
+\includelineno UpdateAndResidueExample.cpp
 
 The result reads as:
 
@@ -112,7 +112,7 @@ return u_view[ i ] > 0.0 ? u_view[ i ] : 0.0;
 
 to sum up only the positive numbers in the vector.
 
-\include MapReduceExample-1.cpp
+\includelineno MapReduceExample-1.cpp
 
 The result is:
 
@@ -120,7 +120,7 @@ The result is:
 
 Take a look at the following example where the filtering depends on the element indexes rather than values:
 
-\include MapReduceExample-2.cpp
+\includelineno MapReduceExample-2.cpp
 
 The result is:
 
@@ -134,28 +134,54 @@ return u_view[ 2 * i ];
 
 See the following example and compare the execution times.
 
-\include MapReduceExample-3.cpp
+\includelineno MapReduceExample-3.cpp
 
 \include MapReduceExample-3.out
 
 ### Reduction with argument
 
-In some situations we may need to locate given element in the vector. For example index of the smallest or the largest element. `reductionWithArgument` is a function which can do it. In the following example, we modify function for computing the maximum norm of a vedctor. Instead of just computing the value, now we want to get index of the element having the absolute value equal to the max norm. The lambda function `reduction` do not compute only maximum of two given elements anymore, but it must also compute index of the winner. See the following code:
+In some situations we may need to locate given element in the vector. For example index of the smallest or the largest element. `reduceWithArgument` is a function which can do it. In the following example, we modify function for computing the maximum norm of a vector. Instead of just computing the value, now we want to get index of the element having the absolute value equal to the max norm. The lambda function `reduction` do not compute only maximum of two given elements anymore, but it must also compute index of the winner. See the following code:
 
-\include ReductionWithArgument.cpp
+\includelineno ReductionWithArgument.cpp
 
 The definition of the lambda function `reduction` reads as:
 
 ```
-auto reduction = [] __cuda_callable__ ( int& aIdx, const int& bIdx, double& a, const double& b );
+auto reduction = [] __cuda_callable__ ( double& a, const double& b, int& aIdx, const int& bIdx );
 ```
 
-In addition to vector elements valuesd `a` and `b`, it gets also their positions `aIdx` and `bIdx`. The functions is responsible to set `a` to maximum of the two and `aIdx` to the position of the larger element. Note, that the parameters have the above mentioned meaning only in case of computing minimum or maximum.
+In addition to vector elements values `a` and `b`, it gets also their positions `aIdx` and `bIdx`. The functions is responsible to set `a` to maximum of the two and `aIdx` to the position of the larger element. Note, that the parameters have the above mentioned meaning only in case of computing minimum or maximum.
 
 The result looks as:
 
 \include ReductionWithArgument.out
 
+### Using functionals for reduction
+
+You might notice, that the lambda function `reduction` does not take so many different form compared to fetch. In addition, setting the zero (or idempotent) element can be annoying especially when computing minimum or maximum and we need to check std::limits function to make the code working with any type. To make things simpler, TNL offers variants of several functionals known from STL. They can be used instead of the lambda function `reduction` and they also carry the idempotent element. See the following example showing the scalar product of two vectors, now with functional:
+
+\includelineno ScalarProductWithFunctionalExample.cpp
+
+
+This example also shows more compact how to evoke the function `reduce` (lines 19-22). This way, one should be able to perform (parallel) reduction very easily. The result looks as follows:
+
+\include ScalarProductWithFunctionalExample.out
+
+In \ref TNL/Functionals.h you may find probably all operations that can be reasonably used for reduction:
+
+| Functional                        | Reduction operation      |
+|-----------------------------------|--------------------------|
+| \ref TNL::Plus<>                  | Sum                      |
+| \ref TNL::Multiplies<>            | Product                  |
+| \ref TNL::Min<>                   | Minimum                  |
+| \ref TNL::Max<>                   | Maximum                  |
+| \ref TNL::MinWithArg<>            | Minimum with argument    |
+| \ref TNL::MaxWithArg<>            | Maximum with argument    |
+| \ref TNL::LogicalAnd<>            | Logical AND              |
+| \ref TNL::LogicalOr<>             | Logical OR               |
+| \ref TNL::BitAnd<>                | Bit AND                  |
+| \ref TNL::BitOr<>                 | Bit OR                   |
+
 ## Flexible scan
 
 ### Inclusive and exclusive scan
@@ -192,7 +218,7 @@ and exclusive prefix sum of the same sequence is
 
 Both kinds of [scan](https://en.wikipedia.org/wiki/Prefix_sum) are usually applied only on sumation, however product or logical operations could be handy as well. In TNL, prefix sum is implemented in simillar way as reduction and so it can be easily modified by lambda functions. The following example shows how it works:
 
-\include ScanExample.cpp
+\includelineno ScanExample.cpp
 
 Scan does not use `fetch` function because the scan must be performed on a vector (the first parameter we pass to the scan). Its complexity is also higher compared to reduction. Thus if one needs to do some operation with the vector elements before the scan, this can be done explicitly and it will not affect the performance significantlty. On the other hand, the scan function takes interval of the vector elements where the scan is performed as its second and third argument. The next argument is the operation to be performed by the scan and the last parameter is the idempotent ("zero") element if the operation.
 
@@ -203,12 +229,12 @@ The result looks as:
 Exclusive scan works the same way, we just need to specify it by the second template parameter which is set to `ScanType::Exclusive`. The call of the scan then looks as
 
 ```
-Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduce, 0.0 );
+Scan< Device, ScanType::Exclusive >::perform( v, 0, v.getSize(), reduction, 0.0 );
 ```
 
 The complete example looks as follows:
 
-\include ExclusiveScanExample.cpp
+\includelineno ExclusiveScanExample.cpp
 
 And the result looks as:
 
@@ -242,8 +268,8 @@ In addition to common scan, we need to encode the segments of the input sequence
 ```
 **Note: Segmented scan is not implemented for CUDA yet.**
 
-\include SegmentedScanExample.cpp
+\includelineno SegmentedScanExample.cpp
 
 The result reads as:
 
-\include SegmentedScanExample.out
+\include SegmentedScanExample.out
\ No newline at end of file
-- 
GitLab


From 4596dc2a94d2dfc5fad27b5c718c0fdc2f37276f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Jul 2021 14:03:01 +0200
Subject: [PATCH 15/34] Refuctoring Functional.h

---
 .../ReductionWithArgumentWithFunctional.cpp   |   2 +-
 .../ScalarProductWithFunctionalExample.cpp    |   2 +-
 .../SumExampleWithFunctional.cpp              |   2 +-
 .../tutorial_ReductionAndScan.md              |  24 +-
 src/TNL/Functional.h                          | 252 ++----------------
 src/UnitTests/Algorithms/ReductionTest.h      |  18 +-
 6 files changed, 50 insertions(+), 250 deletions(-)

diff --git a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
index f2fa6208f..7b084db0e 100644
--- a/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ReductionWithArgumentWithFunctional.cpp
@@ -14,7 +14,7 @@ maximumNorm( const Vector< double, Device >& v )
    auto view = v.getConstView();
 
    auto fetch = [=] __cuda_callable__ ( int i ) { return abs( view[ i ] ); };
-   return reduceWithArgument< Device >( 0, view.getSize(), fetch, TNL::MaxWithArg<>{} );
+   return reduceWithArgument< Device >( 0, view.getSize(), fetch, TNL::MaxWithArg{} );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
index 74d4ba713..4838f5f77 100644
--- a/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/ScalarProductWithFunctionalExample.cpp
@@ -19,7 +19,7 @@ double scalarProduct( const Vector< double, Device >& u, const Vector< double, D
    return reduce< Device >(
       0, v_view.getSize(),
       [=] __cuda_callable__ ( int i ) { return u_view[ i ] * v_view[ i ]; },
-      TNL::Plus<>{} );
+      TNL::Plus{} );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
index b68ad0a29..9ef7795cd 100644
--- a/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
+++ b/Documentation/Tutorials/ReductionAndScan/SumExampleWithFunctional.cpp
@@ -24,7 +24,7 @@ double sum( const Vector< double, Device >& v )
     * Finally we call the templated function Reduction and pass number of elements to reduce,
     * lambda defined above and functional representing the reduction operation.
     */
-   return reduce< Device >( 0, view.getSize(), fetch, TNL::Plus<>{} );
+   return reduce< Device >( 0, view.getSize(), fetch, TNL::Plus{} );
 }
 
 int main( int argc, char* argv[] )
diff --git a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
index ca729aaba..35246fe4e 100644
--- a/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
+++ b/Documentation/Tutorials/ReductionAndScan/tutorial_ReductionAndScan.md
@@ -169,18 +169,18 @@ This example also shows more compact how to evoke the function `reduce` (lines 1
 
 In \ref TNL/Functionals.h you may find probably all operations that can be reasonably used for reduction:
 
-| Functional                        | Reduction operation      |
-|-----------------------------------|--------------------------|
-| \ref TNL::Plus<>                  | Sum                      |
-| \ref TNL::Multiplies<>            | Product                  |
-| \ref TNL::Min<>                   | Minimum                  |
-| \ref TNL::Max<>                   | Maximum                  |
-| \ref TNL::MinWithArg<>            | Minimum with argument    |
-| \ref TNL::MaxWithArg<>            | Maximum with argument    |
-| \ref TNL::LogicalAnd<>            | Logical AND              |
-| \ref TNL::LogicalOr<>             | Logical OR               |
-| \ref TNL::BitAnd<>                | Bit AND                  |
-| \ref TNL::BitOr<>                 | Bit OR                   |
+| Functional                      | Reduction operation      |
+|---------------------------------|--------------------------|
+| \ref TNL::Plus                  | Sum                      |
+| \ref TNL::Multiplies            | Product                  |
+| \ref TNL::Min                   | Minimum                  |
+| \ref TNL::Max                   | Maximum                  |
+| \ref TNL::MinWithArg            | Minimum with argument    |
+| \ref TNL::MaxWithArg            | Maximum with argument    |
+| \ref TNL::LogicalAnd            | Logical AND              |
+| \ref TNL::LogicalOr             | Logical OR               |
+| \ref TNL::BitAnd                | Bit AND                  |
+| \ref TNL::BitOr                 | Bit OR                   |
 
 ## Flexible scan
 
diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index 3fb394dc7..57d325889 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -10,164 +10,68 @@
 
 #pragma once
 
+#include <functional>
+#include <algorithm>
 #include <limits>
 
 namespace TNL {
 
 /**
- * \brief Replacement of std::plus optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct Plus
-{
-   using ValueType = Value;
-
-   static constexpr Value getIdempotent() { return ( Value ) 0; };
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs + rhs; }
-};
-
-/**
- * \brief Replacement of std::plus optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::plus for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct Plus< void >
+struct Plus : public std::plus< void >
 {
    template< typename T >
    static constexpr T getIdempotent() { return ( T ) 0; };
-
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs + rhs; }
 };
 
 /**
- * \brief Replacement of std::multiplies optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct Multiplies
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = 1;
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs * rhs; }
-};
-
-/**
- * \brief Replacement of std::multiplies optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::multiplies for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct Multiplies< void >
+struct Multiplies : public std::multiplies< void >
 {
    template< typename T >
    static constexpr T getIdempotent() { return ( T ) 1; };
-
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs * rhs; }
-};
-
-/**
- * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct Min
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = std::numeric_limits< Value >::max();
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs < rhs ? lhs : rhs; }
 };
 
 /**
- * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::min for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct Min< void >
+struct Min
 {
    template< typename T >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
 
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs < rhs ? lhs : rhs; }
-};
-
-
-/**
- * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct Max
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = std::numeric_limits< Value >::min();
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs > rhs ? lhs : rhs; }
+   template< typename Value >
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs < rhs ? lhs : rhs; }
 };
 
 /**
- * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::max for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct Max< void >
+struct Max
 {
    template< typename T >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
 
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs > rhs ? lhs : rhs; }
-};
-
-/**
- * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduceWithArgument.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void, typename Index = void >
-struct MinWithArg
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = std::numeric_limits< Value >::max();
-
-   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
-   {
-      if( lhs > rhs )
-      {
-         lhs = rhs;
-         lhsIdx = rhsIdx;
-      }
-      else if( lhs == rhs && rhsIdx < lhsIdx )
-      {
-         lhsIdx = rhsIdx;
-      }
-   }
+   template< typename Value >
+   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs > rhs ? lhs : rhs; }
 };
 
 /**
- * \brief Replacement of std::min optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Extension of std::min for use with \ref TNL::Algorithms::reduceWithArgument.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct MinWithArg< void, void >
+struct MinWithArg
 {
    template< typename T >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::max(); };
@@ -188,38 +92,11 @@ struct MinWithArg< void, void >
 };
 
 /**
- * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduceWithArgument.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void, typename Index = void >
-struct MaxWithArg
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = std::numeric_limits< Value >::min();
-
-   constexpr void operator()( Value& lhs, const Value& rhs, Index& lhsIdx, const Index& rhsIdx ) const
-   {
-      if( lhs < rhs )
-      {
-         lhs = rhs;
-         lhsIdx = rhsIdx;
-      }
-      else if( lhs == rhs && rhsIdx < lhsIdx )
-      {
-         lhsIdx = rhsIdx;
-      }
-   }
-};
-
-/**
- * \brief Replacement of std::max optimized for use with \ref TNL::Algorithms::reduceWithArgument.
+ * \brief Extension of std::max for use with \ref TNL::Algorithms::reduceWithArgument.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct MaxWithArg< void, void >
+struct MaxWithArg
 {
    template< typename T >
    static constexpr T getIdempotent() { return std::numeric_limits< T >::min(); };
@@ -240,124 +117,47 @@ struct MaxWithArg< void, void >
 };
 
 /**
- * \brief Replacement of std::logical_and optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct LogicalAnd
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = ( Value ) true;
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs && rhs; }
-};
-
-/**
- * \brief Replacement of std::logical_and optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::logical_and for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct LogicalAnd< void >
+struct LogicalAnd : public std::logical_and< void >
 {
    template< typename T >
    static constexpr T getIdempotent() { return ( T ) true; };
-
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs && rhs; }
 };
 
 /**
- * \brief Replacement of std::logical_or optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct LogicalOr
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = ( Value ) false;
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs || rhs; }
-};
-
-/**
- * \brief Replacement of std::logical_or optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::logical_or for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct LogicalOr< void >
+struct LogicalOr : public std::logical_or< void >
 {
    template< typename T >
    static constexpr T getIdempotent() { return ( T ) false; };
-
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs || rhs; }
 };
 
-
 /**
- * \brief Replacement of std::bit_and optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct BitAnd
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent = ~static_cast< ValueType >( 0 );
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs & rhs; }
-};
-
-/**
- * \brief Replacement of std::bit_and optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::bit_and for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct BitAnd< void >
+struct BitAnd : public std::bit_and< void >
 {
    template< typename T >
    static constexpr T getIdempotent() { return ~static_cast< T >( 0 ); };
-
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs & rhs; }
-};
-
-/**
- * \brief Replacement of std::bit_or optimized for use with \ref TNL::Algorithms::reduce.
- *
- * \tparam Value is data type.
- */
-template< typename Value = void >
-struct BitOr
-{
-   using ValueType = Value;
-
-   static constexpr ValueType idempotent =  static_cast< ValueType >( 0 );
-
-   constexpr Value operator()( const Value& lhs, const Value& rhs ) const { return lhs | rhs; }
 };
 
 /**
- * \brief Replacement of std::bit_or optimized for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of std::bit_or for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-template<>
-struct BitOr< void >
+struct BitOr : public std::bit_or< void >
 {
    template< typename T >
    static constexpr T getIdempotent() { return static_cast< T >( 0 ); };
-
-   template< typename T >
-   constexpr T operator()( const T& lhs, const T& rhs ) const { return lhs | rhs; }
 };
 
 } // namespace TNL
diff --git a/src/UnitTests/Algorithms/ReductionTest.h b/src/UnitTests/Algorithms/ReductionTest.h
index c198f4e3e..b880642b8 100644
--- a/src/UnitTests/Algorithms/ReductionTest.h
+++ b/src/UnitTests/Algorithms/ReductionTest.h
@@ -35,7 +35,7 @@ void ReduceTest_sum()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Plus<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Plus{} );
       EXPECT_EQ( res, size );
    }
 }
@@ -52,7 +52,7 @@ void ReduceTest_min()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Min<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Min{} );
       EXPECT_EQ( res, 1 );
    }
 }
@@ -69,7 +69,7 @@ void ReduceTest_max()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Max<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::Max{} );
       EXPECT_EQ( res, size );
    }
 }
@@ -86,7 +86,7 @@ void ReduceTest_minWithArg()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MinWithArg<>{} );
+      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MinWithArg{} );
       EXPECT_EQ( res.first, 1 );
       EXPECT_EQ( res.second, 0 );
    }
@@ -104,7 +104,7 @@ void ReduceTest_maxWithArg()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MaxWithArg<>{} );
+      auto res = Algorithms::reduceWithArgument< Device >( ( int ) 0, size, fetch, TNL::MaxWithArg{} );
       EXPECT_EQ( res.first, size );
       EXPECT_EQ( res.second, size - 1 );
    }
@@ -122,7 +122,7 @@ void ReduceTest_logicalAnd()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalAnd<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalAnd{} );
       EXPECT_EQ( res, false );
    }
 }
@@ -139,7 +139,7 @@ void ReduceTest_logicalOr()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalOr<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::LogicalOr{} );
       EXPECT_EQ( res, true );
    }
 }
@@ -156,7 +156,7 @@ void ReduceTest_bitAnd()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitAnd<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitAnd{} );
       EXPECT_EQ( res, 1 );
    }
 }
@@ -173,7 +173,7 @@ void ReduceTest_bitOr()
       auto a_view = a.getView();
 
       auto fetch = [=] __cuda_callable__ ( int idx ) { return a_view[ idx ]; };
-      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitOr<>{} );
+      auto res = Algorithms::reduce< Device >( ( int ) 0, size, fetch, TNL::BitOr{} );
       EXPECT_EQ( res, ( char ) 255 );
    }
 }
-- 
GitLab


From fba5ccefd85f553f78e6c1e76f042a5589036993 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Jul 2021 14:33:15 +0200
Subject: [PATCH 16/34] Refuctoring Functional.h

---
 .../Tutorials/Vectors/tutorial_Vectors.md     | 49 +++++++++++++++++--
 1 file changed, 46 insertions(+), 3 deletions(-)

diff --git a/Documentation/Tutorials/Vectors/tutorial_Vectors.md b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
index 5ac66ccd7..2184fac70 100644
--- a/Documentation/Tutorials/Vectors/tutorial_Vectors.md
+++ b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
@@ -20,24 +20,67 @@ This tutorial introduces vectors in TNL. `Vector`, in addition to `Array`, offer
 
 By *horizontal* operations we mean vector expressions where we have one or more vectors as an input and a vector as an output. In TNL, this kind of operations is performed by the [Expression Templates](https://en.wikipedia.org/wiki/Expression_templates). It makes algebraic operations with vectors easy to do and very efficient at the same time. In some cases, one get even more efficient code compared to [Blas](https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms) and [cuBlas](https://developer.nvidia.com/cublas). See the following example.
 
-\include Expressions.cpp
+\includelineno Expressions.cpp
 
 Output is:
 
 \include Expressions.out
 
-Vector expressions work only with `VectorView` not with `Vector`. The expression is evaluated on the same device where the vectors are allocated, this is done automatically. One cannot, however, mix vectors from different devices in one expression. Vector expression may contain any common function like `min`, `max`, `abs`, `sin`, `cos`, `exp`, `log`, `sqrt`, `pow` etc.
+Vector expressions work only with `VectorView` not with `Vector`. The expression is evaluated on the same device where the vectors are allocated, this is done automatically. One cannot, however, mix vectors from different devices in one expression. Vector expression may contain any common function like the following:
+
+| \ref TNL::min     | Minimas of input vector expressions elements.               |
+| \ref TNL::max     | Maximas of input vector expressions elements.               |
+| \ref TNL::abs     | Absolute values of input vector expression elements.        |
+| \ref TNL::sin     | Sine of input vector expression elements.                   |
+| \ref TNL::cos     | Cosine of input vector expression elements.                 |
+| \ref TNL::tan     | Tangent of input vector expression elements.                |
+| \ref TNL::asin    | Arc sine of input vector expression elements.               |
+| \ref TNL::acos    | Arc cosine of input vector expression elements.             |
+| \ref TNL::atan    | Arc tangent of input vector expression elements.            |
+| \ref TNL::sinh    | Hyperbolic sine of input vector expression elements.        |
+| \ref TNL::cosh    | Hyperbolic cosine of input vector expression elements.      |
+| \ref TNL::tanh    | Hyperbolic tangent of input vector expression elements.     |
+| \ref TNL::asinh   | Arc hyperbolic sine of input vector expression elements.    |
+| \ref TNL::acosh   | Arc hyperbolic cosine of input vector expression elements.  |
+| \ref TNL::atanh   | Arc hyperbolic tangent of input vector expression elements. |
+| \ref TNL::exp     | Exponential function of input vector expression elements.   |
+| \ref TNL::log     | Natural logarithm of input vector expression elements.      |
+| \ref TNL::log10   | Decadic logarithm of input vector expression elements.      |
+| \ref TNL::log2    | Binary logarithm of input vector expression elements.       |
+| \ref TNL::sqrt    | Square root of input vector expression elements.            |
+| \ref TNL::cbrt    | Cubic root of input vector expression elements.             |
+| \ref TNL::pow     | Power of of input vector expression elements.               |
+| \ref TNL::floor   | Rounds downward input vector expression elements.           |
+| \ref TNL::ceil    | Rounds upward of input vector expression elements.          |
+| \ref TNL::sign    | Signum of input vector expression elements.                 |
 
 ### Vertical operations
 
 By *vertical operations* we mean (parallel) reduction based operations where we have one vector expressions as an input and one value as an output. For example computing scalar product, vector norm or finding minimum or maximum of vector elements is based on reduction. See the following example.
 
-\include Reduction.cpp
+\includelineno Reduction.cpp
 
 Output is:
 
 \include Reduction.out
 
+The following table shows vertical operations that can be used on vector expressions:
+
+| \ref TNL::min        | Minimum of vector expression elements.                                    |
+| \ref TNL::argMin     | Minimum of vector expression elements with index of the smallest element. |
+| \ref TNL::max        | Maximum of vector expression elements.                                    |
+| \ref TNL::argMax     | Minimum of vector expression elements with index of the smallest element. |
+| \ref TNL::sum        | Sum of vector expression elements.                                        |
+| \ref TNL::maxNorm    | Maximal norm of vector expression elements.                               |
+| \ref TNL::l1Norm     | l1 norm of vector expression elements.                                    |
+| \ref TNL::l2Norm     | l2 norm of vector expression elements.                                    |
+| \ref TNL::lpNorm     | lp norm of vector expression elements. `p` is given as second argument.   |
+| \ref TNL::product    | Product of vector expression elements.                                    |
+| \ref TNL::logicalAnd | Logical AND of vector expression elements.                                |
+| \ref TNL::logicalOr  | Logical OR of vector expression elements.                                 |
+| \ref TNL::binaryAnd  | Binary AND of vector expression elements.                                 |
+| \ref TNL::binaryOr   | Binary OR of vector expression elements.                                  |
+
 ## Static vectors
 
 Static vectors are derived from static arrays and so they are allocated on the stack and can be created in CUDA kernels as well. Their size is fixed as well and it is given by a template parameter. Static vector is a templated class defined in namespace `TNL::Containers` having two template parameters:
-- 
GitLab


From 472864c30ed2730aff735a6ed2b5e23db5a33d77 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Tue, 6 Jul 2021 14:44:28 +0200
Subject: [PATCH 17/34] Added list of vertical and horizontal operations to
 vectors tutorial.

---
 Documentation/Tutorials/Vectors/tutorial_Vectors.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/Tutorials/Vectors/tutorial_Vectors.md b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
index 2184fac70..4b84e35d3 100644
--- a/Documentation/Tutorials/Vectors/tutorial_Vectors.md
+++ b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
@@ -28,6 +28,8 @@ Output is:
 
 Vector expressions work only with `VectorView` not with `Vector`. The expression is evaluated on the same device where the vectors are allocated, this is done automatically. One cannot, however, mix vectors from different devices in one expression. Vector expression may contain any common function like the following:
 
+| Function          | Meaning                                                     |
+|-------------------|-------------------------------------------------------------|
 | \ref TNL::min     | Minimas of input vector expressions elements.               |
 | \ref TNL::max     | Maximas of input vector expressions elements.               |
 | \ref TNL::abs     | Absolute values of input vector expression elements.        |
@@ -66,6 +68,8 @@ Output is:
 
 The following table shows vertical operations that can be used on vector expressions:
 
+| Function             | Meaning                                                                   |
+|----------------------|---------------------------------------------------------------------------|
 | \ref TNL::min        | Minimum of vector expression elements.                                    |
 | \ref TNL::argMin     | Minimum of vector expression elements with index of the smallest element. |
 | \ref TNL::max        | Maximum of vector expression elements.                                    |
-- 
GitLab


From 433d612b0024a3b8e13db7493397d51af4006fbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 10:43:22 +0200
Subject: [PATCH 18/34] Added example on distributed array.

---
 .../Examples/Containers/CMakeLists.txt        | 20 +++++++++
 .../Containers/DistributedArrayExample.cpp    | 44 +++++++++++++++++++
 2 files changed, 64 insertions(+)
 create mode 100644 Documentation/Examples/Containers/DistributedArrayExample.cpp

diff --git a/Documentation/Examples/Containers/CMakeLists.txt b/Documentation/Examples/Containers/CMakeLists.txt
index bd7d9b714..e85546a45 100644
--- a/Documentation/Examples/Containers/CMakeLists.txt
+++ b/Documentation/Examples/Containers/CMakeLists.txt
@@ -8,18 +8,38 @@ set( COMMON_EXAMPLES
          VectorExample
 )
 
+set( MPI_COMMON_EXAMPLES
+         DistributedArrayExample
+)
+
+SET( mpi_test_parameters -np 4 -H localhost:4 "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/DistributedArrayTest${CMAKE_EXECUTABLE_SUFFIX}" )
+
 if( BUILD_CUDA )
    foreach( target IN ITEMS ${COMMON_EXAMPLES} )
       cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
       add_custom_command( COMMAND ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
       set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
    endforeach()
+   if( ${BUILD_MPI} )
+      foreach( target IN ITEMS ${MPI_COMMON_EXAMPLES} )
+         cuda_add_executable( ${target}-cuda ${target}.cu OPTIONS )
+         add_custom_command( COMMAND "mpirun" ${mpi_test_parameters} ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+         set( CUDA_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+      endforeach()
+   endif( ${BUILD_MPI} )
 else()
    foreach( target IN ITEMS ${HOST_EXAMPLES} )
       add_executable( ${target} ${target}.cpp )
       add_custom_command( COMMAND ${target} > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
       set( HOST_OUTPUTS ${HOST_OUTPUTS} ${target}.out )
    endforeach()
+   if( ${BUILD_MPI} )
+      foreach( target IN ITEMS ${MPI_COMMON_EXAMPLES} )
+         add_executable( ${target} ${target}.cpp )
+         add_custom_command( COMMAND "mpirun" ${mpi_test_parameters} ${target}-cuda > ${TNL_DOCUMENTATION_OUTPUT_SNIPPETS_PATH}/${target}.out OUTPUT ${target}.out )
+         set( HOST_OUTPUTS ${CUDA_OUTPUTS} ${target}.out )
+      endforeach( ${BUILD_MPI} )
+   endif()
 endif()
 
 IF( BUILD_CUDA )
diff --git a/Documentation/Examples/Containers/DistributedArrayExample.cpp b/Documentation/Examples/Containers/DistributedArrayExample.cpp
new file mode 100644
index 000000000..97661e0c7
--- /dev/null
+++ b/Documentation/Examples/Containers/DistributedArrayExample.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+#include <TNL/Containers/Partitioner.h>
+#include <TNL/Containers/DistributedArray.h>
+#include <TNL/MPI/ScopedInitializer.h>
+
+using namespace TNL;
+using namespace std;
+
+/***
+ * The following works for any device (CPU, GPU ...).
+ */
+template< typename Device >
+void distributedArrayExample()
+{
+   using ArrayType = Containers::DistributedArray< int, Device >;
+   using IndexType = typename ArrayType::IndexType;
+   using LocalRangeType = typename ArrayType::LocalRangeType;
+
+   const MPI_Comm group = TNL::MPI::AllGroup();
+   const int rank = TNL::MPI::GetRank(group);
+   const int nproc = TNL::MPI::GetSize(group);
+
+   /***
+    * We set size to prime number to force non-uniform distribution of the distributed array.
+    */
+   const int size = 97;
+   const int ghosts = (nproc > 1) ? 4 : 0;
+
+   const LocalRangeType localRange = Containers::Partitioner< IndexType >::splitRange( size, group );
+   ArrayType a( localRange, ghosts, size, group );
+
+}
+
+int main( int argc, char* argv[] )
+{
+   TNL::MPI::ScopedInitializer mpi(argc, argv);
+
+   std::cout << "The first test runs on CPU ..." << std::endl;
+   distributedArrayExample< Devices::Host >();
+#ifdef HAVE_CUDA
+   std::cout << "The second test runs on GPU ..." << std::endl;
+   distributedArrayExample< Devices::Cuda >();
+#endif
+}
-- 
GitLab


From 884664f9ca65053fae057def39b5429b38447bc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 10:43:46 +0200
Subject: [PATCH 19/34] Fixed typo in Subrange.h.

---
 src/TNL/Containers/Subrange.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Containers/Subrange.h b/src/TNL/Containers/Subrange.h
index 17e02c45f..9f95108fc 100644
--- a/src/TNL/Containers/Subrange.h
+++ b/src/TNL/Containers/Subrange.h
@@ -21,7 +21,7 @@
 namespace TNL {
 namespace Containers {
 
-// Specifies a subrange [begin, end) of a range [0, gloablSize).
+// Specifies a subrange [begin, end) of a range [0, globalSize).
 template< typename Index >
 class Subrange
 {
-- 
GitLab


From b6326d3db7c7468c69c7c445b6b8475238740c2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 16:47:56 +0200
Subject: [PATCH 20/34] Added send and receive to Array - it does not work
 properly.

---
 src/TNL/Containers/Array.h       |  7 +++++++
 src/TNL/Containers/Array.hpp     | 20 ++++++++++++++++++++
 src/TNL/Containers/ArrayView.h   |  4 ++++
 src/TNL/Containers/ArrayView.hpp | 11 +++++++++++
 4 files changed, 42 insertions(+)

diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index 77c85c750..b2130da83 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -966,6 +966,13 @@ File& operator>>( File& file, Array< Value, Device, Index, Allocator >& array );
 template< typename Value, typename Device, typename Index, typename Allocator >
 File& operator>>( File&& file, Array< Value, Device, Index, Allocator >& array );
 
+template< typename Value, typename Device, typename Index, typename Allocator >
+void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag = 0, MPI_Comm comm = MPI_COMM_WORLD );
+
+template< typename Value, typename Device, typename Index, typename Allocator >
+void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag = 0, MPI_Comm comm = MPI_COMM_WORLD );
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index 53dd302ef..7aaf2c462 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -911,5 +911,25 @@ File& operator>>( File&& file, Array< Value, Device, Index, Allocator >& array )
    return f >> array;
 }
 
+template< typename Value, typename Device, typename Index, typename Allocator >
+void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag, MPI_Comm comm )
+{
+   send( array.getConstView(), dest, tag, comm );
+}
+
+template< typename Value, typename Device, typename Index, typename Allocator >
+void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag, MPI_Comm comm )
+{
+   TNL_ASSERT( false, "Does not work" );
+   MPI_Status status;
+   Index size;
+   MPI_Recv( ( void* ) size, 1, MPI::getDataType< Index >(), src, tag, comm, &status );
+   std::cerr << "Size = " << size << std::endl;
+   array.setSize( size );
+   MPI_Recv( ( void* ) array.getData(), size * sizeof( Value ), MPI_BYTE, src, tag, comm, &status );
+
+}
+
+
 } // namespace Containers
 } // namespace TNL
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index eb7e548b0..9e81a2b68 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -767,6 +767,10 @@ File& operator>>( File& file, ArrayView< Value, Device, Index > view );
 template< typename Value, typename Device, typename Index >
 File& operator>>( File&& file, ArrayView< Value, Device, Index > view );
 
+template< typename Value, typename Device, typename Index >
+void send( const ArrayView< Value, Device, Index >& view, int dest, int tag = 0, MPI_Comm comm = MPI_COMM_WORLD );
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index bb4105381..944a07412 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -20,6 +20,7 @@
 #include <TNL/Containers/detail/ArrayIO.h>
 #include <TNL/Containers/detail/ArrayAssignment.h>
 #include <TNL/Allocators/Default.h>
+#include <TNL/MPI/getDataType.h>
 
 #include "ArrayView.h"
 
@@ -540,5 +541,15 @@ File& operator>>( File&& file, ArrayView< Value, Device, Index > view )
    return f >> view;
 }
 
+template< typename Value, typename Device, typename Index >
+void send( const ArrayView< Value, Device, Index >& view, int dest, int tag, MPI_Comm comm )
+{
+   TNL_ASSERT( false, "Does not work" );
+   auto size = view.getSize();
+   MPI_Send( ( const void* ) size, 1, MPI::getDataType< Index >(), dest, tag, comm );
+   MPI_Send( ( const void* ) view.getData(), view.getSize() * sizeof( Value ), MPI_BYTE, dest, tag, comm );
+}
+
+
 } // namespace Containers
 } // namespace TNL
-- 
GitLab


From 4d1533c81550da58a1ffc93bc1505e47ac2388cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 16:48:35 +0200
Subject: [PATCH 21/34] Adding operator << for DistributedArray(View).

---
 src/TNL/Containers/DistributedArray.h       | 71 +++++++++++++++++++++
 src/TNL/Containers/DistributedArray.hpp     | 24 +++++++
 src/TNL/Containers/DistributedArrayView.h   | 71 +++++++++++++++++++++
 src/TNL/Containers/DistributedArrayView.hpp | 57 +++++++++++++++++
 4 files changed, 223 insertions(+)

diff --git a/src/TNL/Containers/DistributedArray.h b/src/TNL/Containers/DistributedArray.h
index 3947bfec4..15d8eaa53 100644
--- a/src/TNL/Containers/DistributedArray.h
+++ b/src/TNL/Containers/DistributedArray.h
@@ -193,6 +193,67 @@ public:
    template< typename Array >
    bool operator!=( const Array& array ) const;
 
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end).
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f );
+
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end) for constant instances of the array.
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f ) const;
+
+
    // Checks if there is an element with given value in this array
    bool containsValue( ValueType value ) const;
 
@@ -215,6 +276,16 @@ private:
    static void setSynchronizerHelper( ViewType& view, const Array& array ) {}
 };
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+std::ostream& operator<<( std::ostream& str, const DistributedArray< Value, Device, Index, Allocator >& array )
+{
+   return array.getConstView().print( str );
+}
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/DistributedArray.hpp b/src/TNL/Containers/DistributedArray.hpp
index e9ee12093..dcfaeee2d 100644
--- a/src/TNL/Containers/DistributedArray.hpp
+++ b/src/TNL/Containers/DistributedArray.hpp
@@ -449,6 +449,30 @@ operator!=( const Array& array ) const
    return view != array;
 }
 
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename Function >
+void
+DistributedArray< Value, Device, Index, Allocator >::
+forElements( IndexType begin, IndexType end, Function&& f )
+{
+   this->view.forElements( begin, end, f );
+}
+
+template< typename Value,
+          typename Device,
+          typename Index,
+          typename Allocator >
+   template< typename Function >
+void
+DistributedArray< Value, Device, Index, Allocator >::
+forElements( IndexType begin, IndexType end, Function&& f ) const
+{
+   this->view.forElements( begin, end, f );
+}
+
 template< typename Value,
           typename Device,
           typename Index,
diff --git a/src/TNL/Containers/DistributedArrayView.h b/src/TNL/Containers/DistributedArrayView.h
index cb3235ddb..9da306744 100644
--- a/src/TNL/Containers/DistributedArrayView.h
+++ b/src/TNL/Containers/DistributedArrayView.h
@@ -170,12 +170,73 @@ public:
    template< typename Array >
    bool operator!=( const Array& array ) const;
 
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end).
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f );
+
+      /**
+       * \brief Process the lambda function \e f for each array element in interval [ \e begin, \e end) for constant instances of the array.
+       *
+       * The lambda function is supposed to be declared as
+       *
+       * ```
+       * f( IndexType elementIdx, ValueType& elementValue )
+       * ```
+       *
+       * where
+       *
+       * - \e elementIdx is an index of the array element being currently processed
+       * - \e elementValue is a value of the array element being currently processed
+       *
+       * This is performed at the same place where the array is allocated,
+       * i.e. it is efficient even on GPU.
+       *
+       * \param begin The beginning of the array elements interval.
+       * \param end The end of the array elements interval.
+       * \param f The lambda function to be processed.
+       *
+       * \par Example
+       * \include Containers/ArrayExample_forElements.cpp
+       * \par Output
+       * \include ArrayExample_forElements.out
+       *
+       */
+      template< typename Function >
+      void forElements( IndexType begin, IndexType end, Function&& f ) const;
+
    // Checks if there is an element with given value in this array
    bool containsValue( ValueType value ) const;
 
    // Checks if all elements in this array have the same given value
    bool containsOnlyValue( ValueType value ) const;
 
+   std::ostream& print( std::ostream& str ) const;
 protected:
    LocalRangeType localRange;
    IndexType ghosts = 0;
@@ -187,6 +248,16 @@ protected:
    int valuesPerElement = 1;
 };
 
+
+template< typename Value,
+          typename Device = Devices::Host,
+          typename Index = int >
+std::ostream& operator<<( std::ostream& str, const DistributedArrayView< Value, Device, Index >& view )
+{
+   return view.print( str );
+}
+
+
 } // namespace Containers
 } // namespace TNL
 
diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 65ecc4101..1f7d3a082 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -435,6 +435,37 @@ operator!=( const Array& array ) const
    return ! (*this == array);
 }
 
+template< typename Value,
+          typename Device,
+          typename Index >
+   template< typename Function >
+void
+DistributedArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f )
+{
+   IndexType localBegin = max( begin, localRange.getBegin() );
+   IndexType localEnd = min( end, localRange.getEnd() );
+   auto local_f = [=] __cuda_callable__ ( const IndexType& idx, ValueType& value ) mutable {
+      f( idx + localRange.getBegin(), value );
+   };
+   this->localData.forElements( localBegin - localRange.getBegin(),
+                                localEnd - localRange.getBegin(),
+                                local_f );
+
+}
+
+template< typename Value,
+          typename Device,
+          typename Index >
+   template< typename Function >
+void
+DistributedArrayView< Value, Device, Index >::
+forElements( IndexType begin, IndexType end, Function&& f ) const
+{
+
+}
+
+
 template< typename Value,
           typename Device,
           typename Index >
@@ -465,5 +496,31 @@ containsOnlyValue( ValueType value ) const
    return result;
 }
 
+template< typename Value,
+          typename Device,
+          typename Index >
+std::ostream&
+DistributedArrayView< Value, Device, Index >::
+print( std::ostream& str ) const
+{
+   // The following does not work properly
+   /*if( MPI::GetRank( group ) == 0 )
+   {
+      str << "[ ";
+      for( IndexType i = 0; i < localData.getSize(); i++ )
+         str << ", " << localData.getElement( i );
+      for( int proc = 1; proc < MPI::GetSize( group ); proc++ )
+      {
+         Array< std::remove_const_t< Value >, Device, Index > localArray;
+         receive( localArray, proc, 0, group );
+         for( IndexType i = 0; i < localArray.getSize(); i++ )
+            str << ", " << localArray.getElement( i );
+      }
+      str << " ]";
+   }
+   else send( this->localData, 0, 0, this->group );
+   return str;*/
+}
+
 } // namespace Containers
 } // namespace TNL
-- 
GitLab


From 1c71b26a247834b147f9d244e2b560135484ed9a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 16:49:04 +0200
Subject: [PATCH 22/34] Working on example on Distributed array.

---
 .../Examples/Containers/DistributedArrayExample.cpp         | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/Examples/Containers/DistributedArrayExample.cpp b/Documentation/Examples/Containers/DistributedArrayExample.cpp
index 97661e0c7..8e191f652 100644
--- a/Documentation/Examples/Containers/DistributedArrayExample.cpp
+++ b/Documentation/Examples/Containers/DistributedArrayExample.cpp
@@ -13,11 +13,12 @@ template< typename Device >
 void distributedArrayExample()
 {
    using ArrayType = Containers::DistributedArray< int, Device >;
+   using LocalArrayType = Containers::Array< int, Device >;
    using IndexType = typename ArrayType::IndexType;
    using LocalRangeType = typename ArrayType::LocalRangeType;
 
    const MPI_Comm group = TNL::MPI::AllGroup();
-   const int rank = TNL::MPI::GetRank(group);
+   //const int rank = TNL::MPI::GetRank(group);
    const int nproc = TNL::MPI::GetSize(group);
 
    /***
@@ -28,6 +29,9 @@ void distributedArrayExample()
 
    const LocalRangeType localRange = Containers::Partitioner< IndexType >::splitRange( size, group );
    ArrayType a( localRange, ghosts, size, group );
+   a.forElements( 0, size, [=] __cuda_callable__ ( const int idx, int& value ) { value = idx; } );
+   //LocalArrayType localArray = a;
+   //std::cout << a << std::endl;
 
 }
 
-- 
GitLab


From 4f5d82cfc0dba22ab0b78a05f15445e6a2f6f8b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 18:23:01 +0200
Subject: [PATCH 23/34] Fixes for build wihtout MPI.

---
 src/TNL/Containers/Array.hpp     | 3 ++-
 src/TNL/Containers/ArrayView.h   | 1 +
 src/TNL/Containers/ArrayView.hpp | 3 ++-
 src/TNL/MPI/DummyDefs.h          | 1 +
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index 7aaf2c462..86ce5580f 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -920,6 +920,7 @@ void send( const Array< Value, Device, Index, Allocator >& array, int dest, int
 template< typename Value, typename Device, typename Index, typename Allocator >
 void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag, MPI_Comm comm )
 {
+#ifdef HAVE_MPI
    TNL_ASSERT( false, "Does not work" );
    MPI_Status status;
    Index size;
@@ -927,7 +928,7 @@ void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag,
    std::cerr << "Size = " << size << std::endl;
    array.setSize( size );
    MPI_Recv( ( void* ) array.getData(), size * sizeof( Value ), MPI_BYTE, src, tag, comm, &status );
-
+#endif
 }
 
 
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 9e81a2b68..7369b0cec 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -18,6 +18,7 @@
 #include <TNL/File.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
+#include <TNL/MPI.h>
 
 namespace TNL {
 namespace Containers {
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index 944a07412..8ffd5b4be 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -20,7 +20,6 @@
 #include <TNL/Containers/detail/ArrayIO.h>
 #include <TNL/Containers/detail/ArrayAssignment.h>
 #include <TNL/Allocators/Default.h>
-#include <TNL/MPI/getDataType.h>
 
 #include "ArrayView.h"
 
@@ -544,10 +543,12 @@ File& operator>>( File&& file, ArrayView< Value, Device, Index > view )
 template< typename Value, typename Device, typename Index >
 void send( const ArrayView< Value, Device, Index >& view, int dest, int tag, MPI_Comm comm )
 {
+#ifdef HAVE_MPI
    TNL_ASSERT( false, "Does not work" );
    auto size = view.getSize();
    MPI_Send( ( const void* ) size, 1, MPI::getDataType< Index >(), dest, tag, comm );
    MPI_Send( ( const void* ) view.getData(), view.getSize() * sizeof( Value ), MPI_BYTE, dest, tag, comm );
+#endif
 }
 
 
diff --git a/src/TNL/MPI/DummyDefs.h b/src/TNL/MPI/DummyDefs.h
index 578e46dfe..b61b467e1 100644
--- a/src/TNL/MPI/DummyDefs.h
+++ b/src/TNL/MPI/DummyDefs.h
@@ -47,5 +47,6 @@ enum {
 #define MPI_CART               1                       /* cartesian topology */
 #define MPI_GRAPH              2                       /* graph topology */
 #define MPI_KEYVAL_INVALID     -1                      /* invalid key value */
+#define MPI_COMM_WORLD         0
 
 #endif
-- 
GitLab


From ec45a042206643873a463d79d15dff69c170ca7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 18:36:29 +0200
Subject: [PATCH 24/34] Fixed missing return statement in DistributedArrayView.

---
 src/TNL/Containers/DistributedArrayView.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Containers/DistributedArrayView.hpp b/src/TNL/Containers/DistributedArrayView.hpp
index 1f7d3a082..223ea99c8 100644
--- a/src/TNL/Containers/DistributedArrayView.hpp
+++ b/src/TNL/Containers/DistributedArrayView.hpp
@@ -518,8 +518,8 @@ print( std::ostream& str ) const
       }
       str << " ]";
    }
-   else send( this->localData, 0, 0, this->group );
-   return str;*/
+   else send( this->localData, 0, 0, this->group );*/
+   return str;
 }
 
 } // namespace Containers
-- 
GitLab


From fffb81951b717c4839af089db1f41a77f7cdb1a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 18:56:45 +0200
Subject: [PATCH 25/34] Fixing assertions in Array and ArrayView.

---
 src/TNL/Containers/Array.hpp     | 2 +-
 src/TNL/Containers/ArrayView.hpp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Containers/Array.hpp b/src/TNL/Containers/Array.hpp
index 86ce5580f..d935840ff 100644
--- a/src/TNL/Containers/Array.hpp
+++ b/src/TNL/Containers/Array.hpp
@@ -921,7 +921,7 @@ template< typename Value, typename Device, typename Index, typename Allocator >
 void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag, MPI_Comm comm )
 {
 #ifdef HAVE_MPI
-   TNL_ASSERT( false, "Does not work" );
+   TNL_ASSERT_TRUE( false, "Does not work" );
    MPI_Status status;
    Index size;
    MPI_Recv( ( void* ) size, 1, MPI::getDataType< Index >(), src, tag, comm, &status );
diff --git a/src/TNL/Containers/ArrayView.hpp b/src/TNL/Containers/ArrayView.hpp
index 8ffd5b4be..8f6b446fe 100644
--- a/src/TNL/Containers/ArrayView.hpp
+++ b/src/TNL/Containers/ArrayView.hpp
@@ -544,7 +544,7 @@ template< typename Value, typename Device, typename Index >
 void send( const ArrayView< Value, Device, Index >& view, int dest, int tag, MPI_Comm comm )
 {
 #ifdef HAVE_MPI
-   TNL_ASSERT( false, "Does not work" );
+   TNL_ASSERT_TRUE( false, "Does not work" );
    auto size = view.getSize();
    MPI_Send( ( const void* ) size, 1, MPI::getDataType< Index >(), dest, tag, comm );
    MPI_Send( ( const void* ) view.getData(), view.getSize() * sizeof( Value ), MPI_BYTE, dest, tag, comm );
-- 
GitLab


From b8dbc55eda7aa39f107e2bd133b254861fac1ba5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Wed, 7 Jul 2021 20:14:17 +0200
Subject: [PATCH 26/34] Commenting legacy code because of nvcc complains.

---
 src/Examples/flow-sw/navierStokesProblem_impl.h   | 7 +++++--
 src/Examples/flow-vl/navierStokesProblem_impl.h   | 7 +++++--
 src/Examples/flow/navierStokesProblem_impl.h      | 7 +++++--
 src/Examples/inviscid-flow-sw/eulerProblem_impl.h | 8 ++++++--
 src/Examples/inviscid-flow-vl/eulerProblem_impl.h | 7 +++++--
 src/Examples/inviscid-flow/eulerProblem_impl.h    | 7 +++++--
 6 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/src/Examples/flow-sw/navierStokesProblem_impl.h b/src/Examples/flow-sw/navierStokesProblem_impl.h
index e42c80894..45eab8db9 100644
--- a/src/Examples/flow-sw/navierStokesProblem_impl.h
+++ b/src/Examples/flow-sw/navierStokesProblem_impl.h
@@ -177,7 +177,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -194,7 +197,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/flow-vl/navierStokesProblem_impl.h b/src/Examples/flow-vl/navierStokesProblem_impl.h
index e42c80894..45eab8db9 100644
--- a/src/Examples/flow-vl/navierStokesProblem_impl.h
+++ b/src/Examples/flow-vl/navierStokesProblem_impl.h
@@ -177,7 +177,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -194,7 +197,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/flow/navierStokesProblem_impl.h b/src/Examples/flow/navierStokesProblem_impl.h
index c4c5795c8..69c226bad 100644
--- a/src/Examples/flow/navierStokesProblem_impl.h
+++ b/src/Examples/flow/navierStokesProblem_impl.h
@@ -189,7 +189,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+   /*FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -206,7 +209,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/inviscid-flow-sw/eulerProblem_impl.h b/src/Examples/inviscid-flow-sw/eulerProblem_impl.h
index f56fb295a..6195a18de 100644
--- a/src/Examples/inviscid-flow-sw/eulerProblem_impl.h
+++ b/src/Examples/inviscid-flow-sw/eulerProblem_impl.h
@@ -174,7 +174,11 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+   
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -191,7 +195,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/inviscid-flow-vl/eulerProblem_impl.h b/src/Examples/inviscid-flow-vl/eulerProblem_impl.h
index f56fb295a..e921eaab2 100644
--- a/src/Examples/inviscid-flow-vl/eulerProblem_impl.h
+++ b/src/Examples/inviscid-flow-vl/eulerProblem_impl.h
@@ -174,7 +174,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+   TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+   /*FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -191,7 +194,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "momentum-" );
    this->conservativeVariables->getMomentum()->write( "momentum", fileName.getFileName() );
-
+   */
    return true;
 }
 
diff --git a/src/Examples/inviscid-flow/eulerProblem_impl.h b/src/Examples/inviscid-flow/eulerProblem_impl.h
index 5a7a42d1e..50bcfbaec 100644
--- a/src/Examples/inviscid-flow/eulerProblem_impl.h
+++ b/src/Examples/inviscid-flow/eulerProblem_impl.h
@@ -175,7 +175,10 @@ makeSnapshot( const RealType& time,
   physicalVariablesGetter.getVelocity( this->conservativeVariables, this->velocity );
   physicalVariablesGetter.getPressure( this->conservativeVariables, this->gamma, this->pressure );
 
-   FileName fileName;
+  TNL_ASSERT_TRUE( false, "The following does not work." );
+   // nvcc 10.1.243
+   // TNL/Functions/VectorField.h|404| error #2986: cannot use an entity undefined in device code
+  /* FileName fileName;
    fileName.setExtension( "vti" );
    fileName.setIndex( step );
    fileName.setFileNameBase( "density-" );
@@ -189,7 +192,7 @@ makeSnapshot( const RealType& time,
 
    fileName.setFileNameBase( "energy-" );
    this->conservativeVariables->getEnergy()->write( "energy", fileName.getFileName() );
-
+   */
    return true;
 }
 
-- 
GitLab


From 5cdc0678ac07939d115c0f5d3c3e59c76cbb1e22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 7 Jul 2021 19:41:10 +0000
Subject: [PATCH 27/34] Apply 1 suggestion(s) to 1 file(s)

---
 src/TNL/Algorithms/Reduction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index f90c6934d..08f38153c 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -130,7 +130,7 @@ auto reduce( const Index begin,
                                                end,
                                                std::forward< Fetch >( fetch ),
                                                std::forward< Reduce >( reduce ),
-                                               std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
+                                               reduce.template getIdempotent< Result >() );
 }
 
 /**
-- 
GitLab


From 84b7a28b0f100011cf8fabe9e68c60f6559dd71d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 7 Jul 2021 19:41:21 +0000
Subject: [PATCH 28/34] Apply 1 suggestion(s) to 1 file(s)

---
 src/TNL/Algorithms/Reduction.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index 08f38153c..e1684fdfa 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -257,7 +257,7 @@ reduceWithArgument( const Index begin,
                                                            end,
                                                            std::forward< Fetch >( fetch ),
                                                            std::forward< Reduce >( reduce ),
-                                                           std::remove_reference< Reduce >::type::template getIdempotent< Result >() );
+                                                           reduce.template getIdempotent< Result >() );
 }
 
    } // namespace Algorithms
-- 
GitLab


From f84b88b42e8c8a86727636abc14f128e50e98e1f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 7 Jul 2021 19:41:39 +0000
Subject: [PATCH 29/34] Apply 1 suggestion(s) to 1 file(s)

---
 src/TNL/Functional.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index 57d325889..b7a23992d 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -17,7 +17,7 @@
 namespace TNL {
 
 /**
- * \brief Extension of std::plus for use with \ref TNL::Algorithms::reduce.
+ * \brief Extension of \ref std::plus<void> for use with \ref TNL::Algorithms::reduce.
  *
  * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
-- 
GitLab


From 8f965848c003574d5967d41dadcf799adaafd7a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 7 Jul 2021 19:42:16 +0000
Subject: [PATCH 30/34] Apply 1 suggestion(s) to 1 file(s)

---
 src/TNL/Functional.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/TNL/Functional.h b/src/TNL/Functional.h
index b7a23992d..d87e078e3 100644
--- a/src/TNL/Functional.h
+++ b/src/TNL/Functional.h
@@ -19,7 +19,6 @@ namespace TNL {
 /**
  * \brief Extension of \ref std::plus<void> for use with \ref TNL::Algorithms::reduce.
  *
- * This is specialization for void type. The real type is deduced just when operator() is evoked.
  */
 struct Plus : public std::plus< void >
 {
-- 
GitLab


From 16bb531678891b7e3fd1157fff0aa628c4f5e92c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 8 Jul 2021 12:41:05 +0200
Subject: [PATCH 31/34] Fixing tutorial on vectors.

---
 .../Tutorials/Vectors/Expressions.cpp         |  6 +-
 .../Tutorials/Vectors/tutorial_Vectors.md     | 92 ++++++++++---------
 2 files changed, 48 insertions(+), 50 deletions(-)

diff --git a/Documentation/Tutorials/Vectors/Expressions.cpp b/Documentation/Tutorials/Vectors/Expressions.cpp
index b41d2188e..b261b4470 100644
--- a/Documentation/Tutorials/Vectors/Expressions.cpp
+++ b/Documentation/Tutorials/Vectors/Expressions.cpp
@@ -10,16 +10,12 @@ void expressions()
 {
    using RealType = float;
    using VectorType = Vector< RealType, Device >;
-   using ViewType = VectorView< RealType, Device >;
 
    /****
     * Create vectors
     */
    const int size = 11;
-   VectorType a_v( size ), b_v( size ), c_v( size );
-   ViewType a = a_v.getView();
-   ViewType b = b_v.getView();
-   ViewType c = c_v.getView();
+   VectorType a( size ), b( size ), c( size );
    a.forAllElements( [] __cuda_callable__ ( int i, RealType& value ) { value = 3.14 * ( i - 5.0 ) / 5.0; } );
    b = a * a;
    c = 3 * a + sign( a ) * sin( a );
diff --git a/Documentation/Tutorials/Vectors/tutorial_Vectors.md b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
index 4b84e35d3..5cd3b60f1 100644
--- a/Documentation/Tutorials/Vectors/tutorial_Vectors.md
+++ b/Documentation/Tutorials/Vectors/tutorial_Vectors.md
@@ -26,35 +26,37 @@ Output is:
 
 \include Expressions.out
 
-Vector expressions work only with `VectorView` not with `Vector`. The expression is evaluated on the same device where the vectors are allocated, this is done automatically. One cannot, however, mix vectors from different devices in one expression. Vector expression may contain any common function like the following:
-
-| Function          | Meaning                                                     |
-|-------------------|-------------------------------------------------------------|
-| \ref TNL::min     | Minimas of input vector expressions elements.               |
-| \ref TNL::max     | Maximas of input vector expressions elements.               |
-| \ref TNL::abs     | Absolute values of input vector expression elements.        |
-| \ref TNL::sin     | Sine of input vector expression elements.                   |
-| \ref TNL::cos     | Cosine of input vector expression elements.                 |
-| \ref TNL::tan     | Tangent of input vector expression elements.                |
-| \ref TNL::asin    | Arc sine of input vector expression elements.               |
-| \ref TNL::acos    | Arc cosine of input vector expression elements.             |
-| \ref TNL::atan    | Arc tangent of input vector expression elements.            |
-| \ref TNL::sinh    | Hyperbolic sine of input vector expression elements.        |
-| \ref TNL::cosh    | Hyperbolic cosine of input vector expression elements.      |
-| \ref TNL::tanh    | Hyperbolic tangent of input vector expression elements.     |
-| \ref TNL::asinh   | Arc hyperbolic sine of input vector expression elements.    |
-| \ref TNL::acosh   | Arc hyperbolic cosine of input vector expression elements.  |
-| \ref TNL::atanh   | Arc hyperbolic tangent of input vector expression elements. |
-| \ref TNL::exp     | Exponential function of input vector expression elements.   |
-| \ref TNL::log     | Natural logarithm of input vector expression elements.      |
-| \ref TNL::log10   | Decadic logarithm of input vector expression elements.      |
-| \ref TNL::log2    | Binary logarithm of input vector expression elements.       |
-| \ref TNL::sqrt    | Square root of input vector expression elements.            |
-| \ref TNL::cbrt    | Cubic root of input vector expression elements.             |
-| \ref TNL::pow     | Power of of input vector expression elements.               |
-| \ref TNL::floor   | Rounds downward input vector expression elements.           |
-| \ref TNL::ceil    | Rounds upward of input vector expression elements.          |
-| \ref TNL::sign    | Signum of input vector expression elements.                 |
+The expression is evaluated on the same device where the vectors are allocated, this is done automatically. One cannot, however, mix vectors from different devices in one expression. Vector expression may contain any common function like the following:
+
+| Expression                     | Meaning                                                     |
+|--------------------------------|-------------------------------------------------------------|
+| `v = TNL::min( expr1, expr2 )` |  `v[ i ] = min( expr1[ i ], expr2[ i ] )`                   |
+| `v = TNL::max( expr1, expr2 )` |  `v[ i ] = max( expr1[ i ], expr2[ i ] )`                   |
+| `v =  TNL::abs( expr )`        |  `v[ i ] = abs( expr[ i ] )`                                |
+| `v =  TNL::sin( expr )`        |  `v[ i ] = sin( expr[ i ] )`                                |
+| `v =  TNL::cos( expr )`        |  `v[ i ] = cos( expr[ i ] )`                                |
+| `v =  TNL::tan( expr )`        |  `v[ i ] = tan( expr[ i ] )`                                |
+| `v =  TNL::asin( expr )`       |  `v[ i ] = asin( expr[ i ] )`                               |
+| `v =  TNL::acos( expr )`       |  `v[ i ] = acos( expr[ i ] )`                               |
+| `v =  TNL::atan( expr )`       |  `v[ i ] = atan( expr[ i ] )`                               |
+| `v =  TNL::sinh( expr )`       |  `v[ i ] = sinh( expr[ i ] )`                               |
+| `v =  TNL::cosh( expr )`       |  `v[ i ] = cosh( expr[ i ] )`                               |
+| `v =  TNL::tanh( expr )`       |  `v[ i ] = tanh( expr[ i ] )`                               |
+| `v =  TNL::asinh( expr )`      |  `v[ i ] = asinh( expr[ i ] )`                              |
+| `v =  TNL::acosh( expr )`      |  `v[ i ] = acosh( expr[ i ] )`                              |
+| `v =  TNL::atanh( expr )`      |  `v[ i ] = atanh( expr[ i ] )`                              |
+| `v =  TNL::exp( expr )`        |  `v[ i ] = exp( expr[ i ] )`                                |
+| `v =  TNL::log( expr )`        |  `v[ i ] = log( expr[ i ] )`                                |
+| `v =  TNL::log10( expr )`      |  `v[ i ] = log10( expr[ i ] )`                              |
+| `v =  TNL::log2( expr )`       |  `v[ i ] = log2( expr[ i ] )`                               |
+| `v =  TNL::sqrt( expr )`       |  `v[ i ] = sqrt( expr[ i ] )`                               |
+| `v =  TNL::cbrt( expr )`       |  `v[ i ] = cbrt( expr[ i ] )`                               |
+| `v =  TNL::pow( expr )`        |  `v[ i ] = pow( expr[ i ] )`                                |
+| `v =  TNL::floor( expr )`      |  `v[ i ] = floor( expr[ i ] )`                              |
+| `v =  TNL::ceil( expr )`       |  `v[ i ] = ceil( expr[ i ] )`                               |
+| `v =  TNL::sign( expr )`       |  `v[ i ] = sign( expr[ i ] )`                               |
+
+Where `v` is a result vector and `expr`, `expr1` and `expr2` are vector expressions. Vector expressions can be combined with vector views (\ref TNL::Containers::VectorView) as well.
 
 ### Vertical operations
 
@@ -68,22 +70,22 @@ Output is:
 
 The following table shows vertical operations that can be used on vector expressions:
 
-| Function             | Meaning                                                                   |
-|----------------------|---------------------------------------------------------------------------|
-| \ref TNL::min        | Minimum of vector expression elements.                                    |
-| \ref TNL::argMin     | Minimum of vector expression elements with index of the smallest element. |
-| \ref TNL::max        | Maximum of vector expression elements.                                    |
-| \ref TNL::argMax     | Minimum of vector expression elements with index of the smallest element. |
-| \ref TNL::sum        | Sum of vector expression elements.                                        |
-| \ref TNL::maxNorm    | Maximal norm of vector expression elements.                               |
-| \ref TNL::l1Norm     | l1 norm of vector expression elements.                                    |
-| \ref TNL::l2Norm     | l2 norm of vector expression elements.                                    |
-| \ref TNL::lpNorm     | lp norm of vector expression elements. `p` is given as second argument.   |
-| \ref TNL::product    | Product of vector expression elements.                                    |
-| \ref TNL::logicalAnd | Logical AND of vector expression elements.                                |
-| \ref TNL::logicalOr  | Logical OR of vector expression elements.                                 |
-| \ref TNL::binaryAnd  | Binary AND of vector expression elements.                                 |
-| \ref TNL::binaryOr   | Binary OR of vector expression elements.                                  |
+| Expression                                   | Meaning                                                                                            |
+|----------------------------------------------|----------------------------------------------------------------------------------------------------|
+| `v =  TNL::min( expr )`                      | `v` is minimum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                       |
+| `std::pair( v, i ) =  TNL::argMin( expr )`   | `v` is minimum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`, `i` is index of the smallest element. |
+| `v =  TNL::max( expr )`                      | `v` is maximum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                       |
+| `std::pair( v, i ) =  TNL::argMax( expr )`   | `v` is maximum of `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`, `i` is index of the largest element.  |
+| `v =  TNL::sum( expr )`                      | `v` is sum of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                          |
+| `v =  TNL::maxNorm( expr )`                  | `v` is maximal norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                 |
+| `v =  TNL::l1Norm( expr )`                   | `v` is l1 norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::l2Norm( expr )`                   | `v` is l2 norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::lpNorm( expr, p )`                | `v` is lp norm of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::product( expr )`                  | `v` is product of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                      |
+| `v =  TNL::logicalAnd( expr )`               | `v` is logical AND of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                  |
+| `v =  TNL::logicalOr( expr )`                | `v` is logical OR of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                   |
+| `v =  TNL::binaryAnd( expr )`                | `v` is binary AND of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                   |
+| `v =  TNL::binaryOr( expr )`                 | `v` is binary OR of  `expr[ 0 ], expr[ 1 ] , .... expr[ n-1 ]`.                                    |
 
 ## Static vectors
 
-- 
GitLab


From e5a0a816babed4e4547c65228ecd92cefb85537a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 8 Jul 2021 12:44:57 +0200
Subject: [PATCH 32/34] Fixing comments in Reduction.h

---
 src/TNL/Algorithms/Reduction.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index f90c6934d..89316f4a8 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -13,9 +13,9 @@
 #pragma once
 
 #include <utility>  // std::pair
-#include <functional>        // reduction functions like std::plus, std::logical_and, std::logical_or etc. - deprecated
+#include <functional>        // reduction functions like std::plus, std::logical_and, std::logical_or etc.
 
-#include <TNL/Functional.h>  // replacement of STL functional
+#include <TNL/Functional.h>  // modification of STL functionals made more suitable reduction
 #include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
-- 
GitLab


From 9410e049d259ec7f9583ca4b4d82f0d45b39f223 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 8 Jul 2021 13:02:18 +0200
Subject: [PATCH 33/34] Erasing MPI_COMM_WORLD, using AllGroup instead.

---
 src/TNL/Containers/Array.h     | 4 ++--
 src/TNL/Containers/ArrayView.h | 2 +-
 src/TNL/MPI/DummyDefs.h        | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index b2130da83..e59f60eb4 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -967,10 +967,10 @@ template< typename Value, typename Device, typename Index, typename Allocator >
 File& operator>>( File&& file, Array< Value, Device, Index, Allocator >& array );
 
 template< typename Value, typename Device, typename Index, typename Allocator >
-void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag = 0, MPI_Comm comm = MPI_COMM_WORLD );
+void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup );
 
 template< typename Value, typename Device, typename Index, typename Allocator >
-void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag = 0, MPI_Comm comm = MPI_COMM_WORLD );
+void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag = 0, MPI_Comm comm = MPI::AllGroup );
 
 
 } // namespace Containers
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 7369b0cec..7b03738a0 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -769,7 +769,7 @@ template< typename Value, typename Device, typename Index >
 File& operator>>( File&& file, ArrayView< Value, Device, Index > view );
 
 template< typename Value, typename Device, typename Index >
-void send( const ArrayView< Value, Device, Index >& view, int dest, int tag = 0, MPI_Comm comm = MPI_COMM_WORLD );
+void send( const ArrayView< Value, Device, Index >& view, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup );
 
 
 } // namespace Containers
diff --git a/src/TNL/MPI/DummyDefs.h b/src/TNL/MPI/DummyDefs.h
index b61b467e1..578e46dfe 100644
--- a/src/TNL/MPI/DummyDefs.h
+++ b/src/TNL/MPI/DummyDefs.h
@@ -47,6 +47,5 @@ enum {
 #define MPI_CART               1                       /* cartesian topology */
 #define MPI_GRAPH              2                       /* graph topology */
 #define MPI_KEYVAL_INVALID     -1                      /* invalid key value */
-#define MPI_COMM_WORLD         0
 
 #endif
-- 
GitLab


From 2d2309cd868a9216b80690e63f395989cad8dae6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com>
Date: Thu, 8 Jul 2021 14:10:04 +0200
Subject: [PATCH 34/34] Fixing use of AllGroup.

---
 src/TNL/Containers/Array.h     | 4 ++--
 src/TNL/Containers/ArrayView.h | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/TNL/Containers/Array.h b/src/TNL/Containers/Array.h
index e59f60eb4..f2c9ca705 100644
--- a/src/TNL/Containers/Array.h
+++ b/src/TNL/Containers/Array.h
@@ -967,10 +967,10 @@ template< typename Value, typename Device, typename Index, typename Allocator >
 File& operator>>( File&& file, Array< Value, Device, Index, Allocator >& array );
 
 template< typename Value, typename Device, typename Index, typename Allocator >
-void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup );
+void send( const Array< Value, Device, Index, Allocator >& array, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup() );
 
 template< typename Value, typename Device, typename Index, typename Allocator >
-void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag = 0, MPI_Comm comm = MPI::AllGroup );
+void receive( Array< Value, Device, Index, Allocator >& array, int src, int tag = 0, MPI_Comm comm = MPI::AllGroup() );
 
 
 } // namespace Containers
diff --git a/src/TNL/Containers/ArrayView.h b/src/TNL/Containers/ArrayView.h
index 7b03738a0..31743c1f6 100644
--- a/src/TNL/Containers/ArrayView.h
+++ b/src/TNL/Containers/ArrayView.h
@@ -769,7 +769,7 @@ template< typename Value, typename Device, typename Index >
 File& operator>>( File&& file, ArrayView< Value, Device, Index > view );
 
 template< typename Value, typename Device, typename Index >
-void send( const ArrayView< Value, Device, Index >& view, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup );
+void send( const ArrayView< Value, Device, Index >& view, int dest, int tag = 0, MPI_Comm comm = MPI::AllGroup() );
 
 
 } // namespace Containers
-- 
GitLab