From 7756e2d0c69479158f6bd9a76f4d6694e199acd1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz>
Date: Wed, 4 Sep 2019 17:17:14 +0200
Subject: [PATCH] Add Devices::Sequential and corresponding specializations
 in TNL::Algorithms

---
 src/TNL/Algorithms/CudaReductionKernel.h      |   6 +-
 src/TNL/Algorithms/MemoryOperations.h         |   6 +-
 src/TNL/Algorithms/MemoryOperationsHost.hpp   |   6 +-
 .../Algorithms/MemoryOperationsSequential.hpp |  16 +-
 src/TNL/Algorithms/Multireduction.h           |  30 +++
 src/TNL/Algorithms/Multireduction.hpp         | 145 +++++++-----
 src/TNL/Algorithms/ParallelFor.h              | 107 +++++----
 src/TNL/Algorithms/Reduction.h                |  25 ++
 src/TNL/Algorithms/Reduction.hpp              | 216 ++++++++++--------
 src/TNL/Algorithms/Scan.h                     | 109 ++++++++-
 src/TNL/Algorithms/Scan.hpp                   | 133 ++++++++---
 src/TNL/Allocators/Default.h                  |   9 +
 src/TNL/Containers/NDArray.h                  |   4 +-
 src/TNL/Devices/Host.h                        |   1 -
 src/TNL/Devices/Sequential.h                  |  21 ++
 15 files changed, 583 insertions(+), 251 deletions(-)
 create mode 100644 src/TNL/Devices/Sequential.h

diff --git a/src/TNL/Algorithms/CudaReductionKernel.h b/src/TNL/Algorithms/CudaReductionKernel.h
index c1004a3742..b97295e000 100644
--- a/src/TNL/Algorithms/CudaReductionKernel.h
+++ b/src/TNL/Algorithms/CudaReductionKernel.h
@@ -351,7 +351,7 @@ struct CudaReductionKernelLauncher
 
       // Copy result on CPU
       Result result;
-      MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 );
+      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result, output, 1 );
       return result;
    }
 
@@ -384,8 +384,8 @@ struct CudaReductionKernelLauncher
       ////
       // Copy result on CPU
       std::pair< Index, Result > result;
-      MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result.first, idxOutput, 1 );
-      MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result.second, output, 1 );
+      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, idxOutput, 1 );
+      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, output, 1 );
       return result;
    }
 
diff --git a/src/TNL/Algorithms/MemoryOperations.h b/src/TNL/Algorithms/MemoryOperations.h
index cdbdb79098..59da324028 100644
--- a/src/TNL/Algorithms/MemoryOperations.h
+++ b/src/TNL/Algorithms/MemoryOperations.h
@@ -10,6 +10,7 @@
 
 #pragma once
 
+#include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Cuda/CudaCallable.h>
@@ -17,12 +18,11 @@
 namespace TNL {
 namespace Algorithms {
 
-template< typename DestinationExecution >
+template< typename DestinationDevice >
 struct MemoryOperations;
 
-// TODO: change "void" to "Execution::Sequential"
 template<>
-struct MemoryOperations< void >
+struct MemoryOperations< Devices::Sequential >
 {
    template< typename Element >
    __cuda_callable__
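
For illustration, a minimal sketch of how the renamed specialization is picked up
at a call site; the buffers here are hypothetical plain host arrays:

```
#include <TNL/Algorithms/MemoryOperations.h>

using namespace TNL;

void example()
{
   double a[ 8 ], b[ 8 ];
   // both calls dispatch to the Devices::Sequential specialization:
   // fill a with a constant, then copy it into b
   Algorithms::MemoryOperations< Devices::Sequential >::set( a, 3.14, 8 );
   Algorithms::MemoryOperations< Devices::Sequential >::copy( b, a, 8 );
}
```
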
diff --git a/src/TNL/Algorithms/MemoryOperationsHost.hpp b/src/TNL/Algorithms/MemoryOperationsHost.hpp
index a886886856..cc85975f55 100644
--- a/src/TNL/Algorithms/MemoryOperationsHost.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsHost.hpp
@@ -93,7 +93,7 @@ copyFromIterator( DestinationElement* destination,
                   SourceIterator first,
                   SourceIterator last )
 {
-   MemoryOperations< void >::copyFromIterator( destination, destinationSize, first, last );
+   MemoryOperations< Devices::Sequential >::copyFromIterator( destination, destinationSize, first, last );
 }
 
 template< typename DestinationElement,
@@ -137,7 +137,7 @@ containsValue( const Element* data,
    }
    else {
       // sequential algorithm can return as soon as it finds a match
-      return MemoryOperations< void >::containsValue( data, size, value );
+      return MemoryOperations< Devices::Sequential >::containsValue( data, size, value );
    }
 }
 
@@ -159,7 +159,7 @@ containsOnlyValue( const Element* data,
    }
    else {
       // sequential algorithm can return as soon as it finds a mismatch
-      return MemoryOperations< void >::containsOnlyValue( data, size, value );
+      return MemoryOperations< Devices::Sequential >::containsOnlyValue( data, size, value );
    }
 }
 
diff --git a/src/TNL/Algorithms/MemoryOperationsSequential.hpp b/src/TNL/Algorithms/MemoryOperationsSequential.hpp
index e427f00dd5..9e5ad25b13 100644
--- a/src/TNL/Algorithms/MemoryOperationsSequential.hpp
+++ b/src/TNL/Algorithms/MemoryOperationsSequential.hpp
@@ -18,7 +18,7 @@ namespace Algorithms {
 template< typename Element >
 __cuda_callable__
 void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 setElement( Element* data,
             const Element& value )
 {
@@ -28,7 +28,7 @@ setElement( Element* data,
 template< typename Element >
 __cuda_callable__
 Element
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 getElement( const Element* data )
 {
    return *data;
@@ -37,7 +37,7 @@ getElement( const Element* data )
 template< typename Element, typename Index >
 __cuda_callable__
 void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 set( Element* data,
      const Element& value,
      const Index size )
@@ -51,7 +51,7 @@ template< typename DestinationElement,
           typename Index >
 __cuda_callable__
 void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 copy( DestinationElement* destination,
       const SourceElement* source,
       const Index size )
@@ -64,7 +64,7 @@ template< typename DestinationElement,
           typename Index,
           typename SourceIterator >
 void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 copyFromIterator( DestinationElement* destination,
                   Index destinationSize,
                   SourceIterator first,
@@ -82,7 +82,7 @@ template< typename Element1,
           typename Index >
 __cuda_callable__
 bool
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 compare( const Element1* destination,
          const Element2* source,
          const Index size )
@@ -97,7 +97,7 @@ template< typename Element,
           typename Index >
 __cuda_callable__
 bool
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 containsValue( const Element* data,
                const Index size,
                const Element& value )
@@ -116,7 +116,7 @@ template< typename Element,
           typename Index >
 __cuda_callable__
 bool
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
 containsOnlyValue( const Element* data,
                    const Index size,
                    const Element& value )
diff --git a/src/TNL/Algorithms/Multireduction.h b/src/TNL/Algorithms/Multireduction.h
index ac67255feb..8e63fa7eab 100644
--- a/src/TNL/Algorithms/Multireduction.h
+++ b/src/TNL/Algorithms/Multireduction.h
@@ -14,6 +14,7 @@
 
 #include <functional>  // reduction functions like std::plus, std::logical_and, std::logical_or etc.
 
+#include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 
@@ -23,6 +24,35 @@ namespace Algorithms {
 template< typename Device >
 struct Multireduction;
 
+template<>
+struct Multireduction< Devices::Sequential >
+{
+   /**
+    * Parameters:
+    *    zero: starting value for reduction
+    *    dataFetcher: callable object such that `dataFetcher( i, j )` yields
+    *                 the i-th value to be reduced from the j-th dataset
+    *                 (i = 0,...,size-1; j = 0,...,n-1)
+    *    reduction: callable object representing the reduction operation;
+    *               for example, it can be an instance of std::plus, std::logical_and,
+    *               std::logical_or, etc.
+    *    size: the size of each dataset
+    *    n: number of datasets to be reduced
+    *    result: output array of size = n
+    */
+   template< typename Result,
+             typename DataFetcher,
+             typename Reduction,
+             typename Index >
+   static constexpr void
+   reduce( const Result zero,
+           DataFetcher dataFetcher,
+           const Reduction reduction,
+           const Index size,
+           const int n,
+           Result* result );
+};
+
 template<>
 struct Multireduction< Devices::Host >
 {
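
For reference, a minimal usage sketch of the new specialization — computing the
sums of the rows of a small hypothetical matrix, so that each row is one dataset:

```
#include <functional>
#include <TNL/Algorithms/Multireduction.h>

using namespace TNL;

void rowSums( const double matrix[ 3 ][ 100 ], double result[ 3 ] )
{
   // dataFetcher( i, j ) returns the i-th element of the j-th dataset (row)
   auto dataFetcher = [&] ( int i, int j ) { return matrix[ j ][ i ]; };
   Algorithms::Multireduction< Devices::Sequential >::reduce
      ( 0.0, dataFetcher, std::plus<>{}, 100, 3, result );
}
```
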
diff --git a/src/TNL/Algorithms/Multireduction.hpp b/src/TNL/Algorithms/Multireduction.hpp
index 25b91f0267..0bfead2871 100644
--- a/src/TNL/Algorithms/Multireduction.hpp
+++ b/src/TNL/Algorithms/Multireduction.hpp
@@ -29,6 +29,83 @@
 namespace TNL {
 namespace Algorithms {
 
+template< typename Result,
+          typename DataFetcher,
+          typename Reduction,
+          typename Index >
+constexpr void
+Multireduction< Devices::Sequential >::
+reduce( const Result zero,
+        DataFetcher dataFetcher,
+        const Reduction reduction,
+        const Index size,
+        const int n,
+        Result* result )
+{
+   TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." );
+   TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." );
+
+   constexpr int block_size = 128;
+   const int blocks = size / block_size;
+
+   if( blocks > 1 ) {
+      // initialize array for unrolled results
+      // (it is accessed as a row-major matrix with n rows and 4 columns)
+      Result r[ n * 4 ];
+      for( int k = 0; k < n * 4; k++ )
+         r[ k ] = zero;
+
+      // main reduction (explicitly unrolled loop)
+      for( int b = 0; b < blocks; b++ ) {
+         const Index offset = b * block_size;
+         for( int k = 0; k < n; k++ ) {
+            Result* _r = r + 4 * k;
+            for( int i = 0; i < block_size; i += 4 ) {
+               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
+               _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
+               _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
+               _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
+            }
+         }
+      }
+
+      // reduction of the last, incomplete block (not unrolled)
+      for( int k = 0; k < n; k++ ) {
+         Result* _r = r + 4 * k;
+         for( Index i = blocks * block_size; i < size; i++ )
+            _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
+      }
+
+      // reduction of unrolled results
+      for( int k = 0; k < n; k++ ) {
+         Result* _r = r + 4 * k;
+         _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
+         _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
+         _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );
+
+         // copy the result into the output parameter
+         result[ k ] = _r[ 0 ];
+      }
+   }
+   else {
+      for( int k = 0; k < n; k++ )
+         result[ k ] = zero;
+
+      for( int b = 0; b < blocks; b++ ) {
+         const Index offset = b * block_size;
+         for( int k = 0; k < n; k++ ) {
+            for( int i = 0; i < block_size; i++ )
+               result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) );
+         }
+      }
+
+      for( int k = 0; k < n; k++ ) {
+         for( Index i = blocks * block_size; i < size; i++ )
+            result[ k ] = reduction( result[ k ], dataFetcher( i, k ) );
+      }
+   }
+}
+
 template< typename Result,
           typename DataFetcher,
           typename Reduction,
@@ -45,10 +122,10 @@ reduce( const Result zero,
    TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." );
    TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." );
 
+#ifdef HAVE_OPENMP
    constexpr int block_size = 128;
    const int blocks = size / block_size;
 
-#ifdef HAVE_OPENMP
    if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
       const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
 #pragma omp parallel num_threads(threads)
@@ -106,67 +183,9 @@ reduce( const Result zero,
          }
       }
    }
-   else {
-#endif
-      if( blocks > 1 ) {
-         // initialize array for unrolled results
-         // (it is accessed as a row-major matrix with n rows and 4 columns)
-         Result r[ n * 4 ];
-         for( int k = 0; k < n * 4; k++ )
-            r[ k ] = zero;
-
-         // main reduction (explicitly unrolled loop)
-         for( int b = 0; b < blocks; b++ ) {
-            const Index offset = b * block_size;
-            for( int k = 0; k < n; k++ ) {
-               Result* _r = r + 4 * k;
-               for( int i = 0; i < block_size; i += 4 ) {
-                  _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i,     k ) );
-                  _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
-                  _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
-                  _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
-               }
-            }
-         }
-
-         // reduction of the last, incomplete block (not unrolled)
-         for( int k = 0; k < n; k++ ) {
-            Result* _r = r + 4 * k;
-            for( Index i = blocks * block_size; i < size; i++ )
-               _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
-         }
-
-         // reduction of unrolled results
-         for( int k = 0; k < n; k++ ) {
-            Result* _r = r + 4 * k;
-            _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
-            _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
-            _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );
-
-            // copy the result into the output parameter
-            result[ k ] = _r[ 0 ];
-         }
-      }
-      else {
-         for( int k = 0; k < n; k++ )
-            result[ k ] = zero;
-
-         for( int b = 0; b < blocks; b++ ) {
-            const Index offset = b * block_size;
-            for( int k = 0; k < n; k++ ) {
-               for( int i = 0; i < block_size; i++ )
-                  result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) );
-            }
-         }
-
-         for( int k = 0; k < n; k++ ) {
-            for( Index i = blocks * block_size; i < size; i++ )
-               result[ k ] = reduction( result[ k ], dataFetcher( i, k ) );
-         }
-      }
-#ifdef HAVE_OPENMP
-   }
+   else
 #endif
+      Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result );
 }
 
 template< typename Result,
@@ -204,7 +223,7 @@ reduce( const Result zero,
 
    // transfer the reduced data from device to host
    std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] };
-   MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize );
+   MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
@@ -215,7 +234,7 @@ reduce( const Result zero,
 
    // finish the reduction on the host
    auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
-   Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
+   Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
 
    #ifdef CUDA_REDUCTION_PROFILING
       timer.stop();
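
A hedged sanity-check sketch of the delegation introduced above: with OpenMP
disabled (or too few blocks), the Host implementation now simply forwards to the
Sequential one, so the two must agree up to floating-point reassociation:

```
#include <functional>
#include <TNL/Algorithms/Multireduction.h>

using namespace TNL;

void check( const double* data, int size )
{
   auto fetch = [=] ( int i, int ) { return data[ i ]; };  // a single dataset
   double seq, host;
   Algorithms::Multireduction< Devices::Sequential >::reduce( 0.0, fetch, std::plus<>{}, size, 1, &seq );
   Algorithms::Multireduction< Devices::Host >::reduce( 0.0, fetch, std::plus<>{}, size, 1, &host );
   // seq == host exactly when OpenMP is disabled; otherwise equal up to
   // reassociation of the floating-point additions
}
```
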
diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h
index 20e87f2220..6d5e917ba4 100644
--- a/src/TNL/Algorithms/ParallelFor.h
+++ b/src/TNL/Algorithms/ParallelFor.h
@@ -10,11 +10,13 @@
 
 #pragma once
 
+#include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 #include <TNL/Cuda/CheckDevice.h>
 #include <TNL/Cuda/DeviceInfo.h>
 #include <TNL/Cuda/LaunchHelpers.h>
+#include <TNL/Exceptions/CudaSupportMissing.h>
 #include <TNL/Math.h>
 
 /****
@@ -33,9 +35,53 @@ namespace Algorithms {
 
 enum ParallelForMode { SynchronousMode, AsynchronousMode };
 
-template< typename Device = Devices::Host,
+template< typename Device = Devices::Sequential,
           ParallelForMode Mode = SynchronousMode >
 struct ParallelFor
+{
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs >
+   static void exec( Index start, Index end, Function f, FunctionArgs... args )
+   {
+      for( Index i = start; i < end; i++ )
+         f( i, args... );
+   }
+};
+
+template< typename Device = Devices::Sequential,
+          ParallelForMode Mode = SynchronousMode >
+struct ParallelFor2D
+{
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs >
+   static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
+   {
+      for( Index j = startY; j < endY; j++ )
+      for( Index i = startX; i < endX; i++ )
+         f( i, j, args... );
+   }
+};
+
+template< typename Device = Devices::Sequential,
+          ParallelForMode Mode = SynchronousMode >
+struct ParallelFor3D
+{
+   template< typename Index,
+             typename Function,
+             typename... FunctionArgs >
+   static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
+   {
+      for( Index k = startZ; k < endZ; k++ )
+      for( Index j = startY; j < endY; j++ )
+      for( Index i = startX; i < endX; i++ )
+         f( i, j, k, args... );
+   }
+};
+
+template< ParallelForMode Mode >
+struct ParallelFor< Devices::Host, Mode >
 {
    template< typename Index,
              typename Function,
@@ -44,26 +90,23 @@ struct ParallelFor
    {
 #ifdef HAVE_OPENMP
       // Benchmarks show that this is significantly faster compared
-      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
-      if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
+      // to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() && end - start > 512 )'
+      if( Devices::Host::isOMPEnabled() && end - start > 512 )
       {
-#pragma omp parallel for
+         #pragma omp parallel for
          for( Index i = start; i < end; i++ )
             f( i, args... );
       }
       else
-         for( Index i = start; i < end; i++ )
-            f( i, args... );
+         ParallelFor< Devices::Sequential >::exec( start, end, f, args... );
 #else
-      for( Index i = start; i < end; i++ )
-         f( i, args... );
+      ParallelFor< Devices::Sequential >::exec( start, end, f, args... );
 #endif
    }
 };
 
-template< typename Device = Devices::Host,
-          ParallelForMode Mode = SynchronousMode >
-struct ParallelFor2D
+template< ParallelForMode Mode >
+struct ParallelFor2D< Devices::Host, Mode >
 {
    template< typename Index,
              typename Function,
@@ -72,30 +115,24 @@ struct ParallelFor2D
    {
 #ifdef HAVE_OPENMP
       // Benchmarks show that this is significantly faster compared
-      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
-      if( TNL::Devices::Host::isOMPEnabled() )
+      // to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() )'
+      if( Devices::Host::isOMPEnabled() )
       {
-#pragma omp parallel for
-         for( Index j = startY; j < endY; j++ )
-         for( Index i = startX; i < endX; i++ )
-            f( i, j, args... );
-      }
-      else {
+         #pragma omp parallel for
          for( Index j = startY; j < endY; j++ )
          for( Index i = startX; i < endX; i++ )
             f( i, j, args... );
       }
+      else
+         ParallelFor2D< Devices::Sequential >::exec( startX, startY, endX, endY, f, args... );
 #else
-      for( Index j = startY; j < endY; j++ )
-      for( Index i = startX; i < endX; i++ )
-         f( i, j, args... );
+      ParallelFor2D< Devices::Sequential >::exec( startX, startY, endX, endY, f, args... );
 #endif
    }
 };
 
-template< typename Device = Devices::Host,
-          ParallelForMode Mode = SynchronousMode >
-struct ParallelFor3D
+template< ParallelForMode Mode >
+struct ParallelFor3D< Devices::Host, Mode >
 {
    template< typename Index,
              typename Function,
@@ -104,27 +141,19 @@ struct ParallelFor3D
    {
 #ifdef HAVE_OPENMP
       // Benchmarks show that this is significantly faster compared
-      // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
-     if( TNL::Devices::Host::isOMPEnabled() )
-     {
-#pragma omp parallel for collapse(2)
-      for( Index k = startZ; k < endZ; k++ )
-      for( Index j = startY; j < endY; j++ )
-      for( Index i = startX; i < endX; i++ )
-         f( i, j, k, args... );
-      }
-      else
+      // to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() )'
+      if( Devices::Host::isOMPEnabled() )
       {
+         #pragma omp parallel for collapse(2)
          for( Index k = startZ; k < endZ; k++ )
          for( Index j = startY; j < endY; j++ )
          for( Index i = startX; i < endX; i++ )
             f( i, j, k, args... );
       }
+      else
+         ParallelFor3D< Devices::Sequential >::exec( startX, startY, startZ, endX, endY, endZ, f, args... );
 #else
-      for( Index k = startZ; k < endZ; k++ )
-      for( Index j = startY; j < endY; j++ )
-      for( Index i = startX; i < endX; i++ )
-         f( i, j, k, args... );
+      ParallelFor3D< Devices::Sequential >::exec( startX, startY, startZ, endX, endY, endZ, f, args... );
 #endif
    }
 };
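
A short usage sketch: with the new default template argument, a bare
`ParallelFor<>` is a plain sequential loop, while `Devices::Host` opts into
OpenMP:

```
#include <TNL/Algorithms/ParallelFor.h>

using namespace TNL;
using namespace TNL::Algorithms;

void fill( double* v, int n )
{
   auto init = [=] ( int i ) { v[ i ] = 0.0; };
   // the default device is now Devices::Sequential -- a plain for loop
   ParallelFor<>::exec( 0, n, init );
   // the Devices::Host specialization parallelizes with OpenMP when available
   ParallelFor< Devices::Host >::exec( 0, n, init );
}
```
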
diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h
index e77fa12064..c0d62684d5 100644
--- a/src/TNL/Algorithms/Reduction.h
+++ b/src/TNL/Algorithms/Reduction.h
@@ -15,6 +15,7 @@
 #include <utility>  // std::pair
 #include <functional>  // reduction functions like std::plus, std::logical_and, std::logical_or etc.
 
+#include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 
@@ -36,6 +37,30 @@ namespace Algorithms {
 template< typename Device >
 struct Reduction;
 
+template<>
+struct Reduction< Devices::Sequential >
+{
+   template< typename Index,
+             typename Result,
+             typename ReductionOperation,
+             typename DataFetcher >
+   static constexpr Result
+   reduce( const Index size,
+           const ReductionOperation& reduction,
+           DataFetcher& dataFetcher,
+           const Result& zero );
+
+   template< typename Index,
+             typename Result,
+             typename ReductionOperation,
+             typename DataFetcher >
+   static constexpr std::pair< Index, Result >
+   reduceWithArgument( const Index size,
+                       const ReductionOperation& reduction,
+                       DataFetcher& dataFetcher,
+                       const Result& zero );
+};
+
 template<>
 struct Reduction< Devices::Host >
 {
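
For reference, a minimal sketch of calling the new sequential reduction; note
that the data fetcher must be an lvalue, since `reduce` takes it by non-const
reference:

```
#include <functional>
#include <TNL/Algorithms/Reduction.h>

using namespace TNL;

double sum( const double* a, int size )
{
   auto fetch = [=] ( int i ) { return a[ i ]; };
   return Algorithms::Reduction< Devices::Sequential >::reduce( size, std::plus<>{}, fetch, 0.0 );
}
```
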
diff --git a/src/TNL/Algorithms/Reduction.hpp b/src/TNL/Algorithms/Reduction.hpp
index 9fd56576eb..b07f04445e 100644
--- a/src/TNL/Algorithms/Reduction.hpp
+++ b/src/TNL/Algorithms/Reduction.hpp
@@ -17,8 +17,8 @@
 //#define CUDA_REDUCTION_PROFILING
 
 #include <TNL/Algorithms/Reduction.h>
-#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 #include <TNL/Algorithms/CudaReductionKernel.h>
+#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
 
 #ifdef CUDA_REDUCTION_PROFILING
 #include <iostream>
@@ -35,8 +35,115 @@ namespace Algorithms {
  */
 static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256;
 
-////
-// Reduction on host
+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename DataFetcher >
+constexpr Result
+Reduction< Devices::Sequential >::
+reduce( const Index size,
+        const ReductionOperation& reduction,
+        DataFetcher& dataFetcher,
+        const Result& zero )
+{
+   constexpr int block_size = 128;
+   const int blocks = size / block_size;
+
+   if( blocks > 1 ) {
+      // initialize array for unrolled results
+      Result r[ 4 ] = { zero, zero, zero, zero };
+
+      // main reduction (explicitly unrolled loop)
+      for( int b = 0; b < blocks; b++ ) {
+         const Index offset = b * block_size;
+         for( int i = 0; i < block_size; i += 4 ) {
+            r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
+            r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
+            r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
+            r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
+         }
+      }
+
+      // reduction of the last, incomplete block (not unrolled)
+      for( Index i = blocks * block_size; i < size; i++ )
+         r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
+
+      // reduction of unrolled results
+      r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
+      r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
+      r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );
+      return r[ 0 ];
+   }
+   else {
+      Result result = zero;
+      for( Index i = 0; i < size; i++ )
+         result = reduction( result, dataFetcher( i ) );
+      return result;
+   }
+}
+
+template< typename Index,
+          typename Result,
+          typename ReductionOperation,
+          typename DataFetcher >
+constexpr std::pair< Index, Result >
+Reduction< Devices::Sequential >::
+reduceWithArgument( const Index size,
+                    const ReductionOperation& reduction,
+                    DataFetcher& dataFetcher,
+                    const Result& zero )
+{
+   constexpr int block_size = 128;
+   const int blocks = size / block_size;
+
+   if( blocks > 1 ) {
+      // initialize array for unrolled results
+      Index arg[ 4 ] = { 0, 0, 0, 0 };
+      Result r[ 4 ] = { zero, zero, zero, zero };
+      bool initialized( false );
+
+      // main reduction (explicitly unrolled loop)
+      for( int b = 0; b < blocks; b++ ) {
+         const Index offset = b * block_size;
+         for( int i = 0; i < block_size; i += 4 ) {
+            if( ! initialized )
+            {
+               arg[ 0 ] = offset + i;
+               arg[ 1 ] = offset + i + 1;
+               arg[ 2 ] = offset + i + 2;
+               arg[ 3 ] = offset + i + 3;
+               r[ 0 ] = dataFetcher( offset + i );
+               r[ 1 ] = dataFetcher( offset + i + 1 );
+               r[ 2 ] = dataFetcher( offset + i + 2 );
+               r[ 3 ] = dataFetcher( offset + i + 3 );
+               initialized = true;
+               continue;
+            }
+            reduction( arg[ 0 ], offset + i,     r[ 0 ], dataFetcher( offset + i ) );
+            reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
+            reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
+            reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
+         }
+      }
+
+      // reduction of the last, incomplete block (not unrolled)
+      for( Index i = blocks * block_size; i < size; i++ )
+         reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
+
+      // reduction of unrolled results
+      reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
+      reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
+      reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
+      return std::make_pair( arg[ 0 ], r[ 0 ] );
+   }
+   else {
+      std::pair< Index, Result > result( 0, dataFetcher( 0 ) );
+      for( Index i = 1; i < size; i++ )
+         reduction( result.first, i, result.second, dataFetcher( i ) );
+      return result;
+   }
+}
+
 template< typename Index,
           typename Result,
           typename ReductionOperation,
@@ -48,10 +155,10 @@ reduce( const Index size,
         DataFetcher& dataFetcher,
         const Result& zero )
 {
+#ifdef HAVE_OPENMP
    constexpr int block_size = 128;
    const int blocks = size / block_size;
 
-#ifdef HAVE_OPENMP
    if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
       // global result variable
       Result result = zero;
@@ -92,42 +199,9 @@ reduce( const Index size,
       }
       return result;
    }
-   else {
-#endif
-      if( blocks > 1 ) {
-         // initialize array for unrolled results
-         Result r[ 4 ] = { zero, zero, zero, zero };
-
-         // main reduction (explicitly unrolled loop)
-         for( int b = 0; b < blocks; b++ ) {
-            const Index offset = b * block_size;
-            for( int i = 0; i < block_size; i += 4 ) {
-               r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) );
-               r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) );
-               r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) );
-               r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) );
-            }
-         }
-
-         // reduction of the last, incomplete block (not unrolled)
-         for( Index i = blocks * block_size; i < size; i++ )
-            r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) );
-
-         // reduction of unrolled results
-         r[ 0 ] = reduction( r[ 0 ], r[ 2 ] );
-         r[ 1 ] = reduction( r[ 1 ], r[ 3 ] );
-         r[ 0 ] = reduction( r[ 0 ], r[ 1 ] );
-         return r[ 0 ];
-      }
-      else {
-         Result result = zero;
-         for( Index i = 0; i < size; i++ )
-            result = reduction( result, dataFetcher( i ) );
-         return result;
-      }
-#ifdef HAVE_OPENMP
-   }
+   else
 #endif
+      return Reduction< Devices::Sequential >::reduce( size, reduction, dataFetcher, zero );
 }
 
 template< typename Index,
@@ -141,10 +215,10 @@ reduceWithArgument( const Index size,
                     DataFetcher& dataFetcher,
                     const Result& zero )
 {
+#ifdef HAVE_OPENMP
    constexpr int block_size = 128;
    const int blocks = size / block_size;
 
-#ifdef HAVE_OPENMP
    if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
       // global result variable
       std::pair< Index, Result > result( -1, zero );
@@ -201,57 +275,9 @@ reduceWithArgument( const Index size,
       }
       return result;
    }
-   else {
-#endif
-      if( blocks > 1 ) {
-         // initialize array for unrolled results
-         Index arg[ 4 ] = { 0, 0, 0, 0 };
-         Result r[ 4 ] = { zero, zero, zero, zero };
-         bool initialized( false );
-
-         // main reduction (explicitly unrolled loop)
-         for( int b = 0; b < blocks; b++ ) {
-            const Index offset = b * block_size;
-            for( int i = 0; i < block_size; i += 4 ) {
-               if( ! initialized )
-               {
-                  arg[ 0 ] = offset + i;
-                  arg[ 1 ] = offset + i + 1;
-                  arg[ 2 ] = offset + i + 2;
-                  arg[ 3 ] = offset + i + 3;
-                  r[ 0 ] = dataFetcher( offset + i );
-                  r[ 1 ] = dataFetcher( offset + i + 1 );
-                  r[ 2 ] = dataFetcher( offset + i + 2 );
-                  r[ 3 ] = dataFetcher( offset + i + 3 );
-                  initialized = true;
-                  continue;
-               }
-               reduction( arg[ 0 ], offset + i,     r[ 0 ], dataFetcher( offset + i ) );
-               reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) );
-               reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) );
-               reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) );
-            }
-         }
-
-         // reduction of the last, incomplete block (not unrolled)
-         for( Index i = blocks * block_size; i < size; i++ )
-            reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) );
-
-         // reduction of unrolled results
-         reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] );
-         reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] );
-         reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] );
-         return std::make_pair( arg[ 0 ], r[ 0 ] );
-      }
-      else {
-         std::pair< Index, Result > result( 0, dataFetcher( 0 ) );
-         for( Index i = 1; i < size; i++ )
-            reduction( result.first, i, result.second, dataFetcher( i ) );
-         return result;
-      }
-#ifdef HAVE_OPENMP
-   }
+   else
 #endif
+      return Reduction< Devices::Sequential >::reduceWithArgument( size, reduction, dataFetcher, zero );
 }
 
 template< typename Index,
@@ -309,7 +335,7 @@ reduce( const Index size,
          new Result[ reducedSize ]
          #endif
       };
-      MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize );
+      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -320,7 +346,7 @@ reduce( const Index size,
 
       // finish the reduction on the host
       auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
-      const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero );
+      const Result result = Reduction< Devices::Sequential >::reduce( reducedSize, reduction, fetch, zero );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -414,8 +440,8 @@ reduceWithArgument( const Index size,
          new Index[ reducedSize ]
          #endif
       };
-      MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize );
-      MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize );
+      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize );
+      MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize );
 
       #ifdef CUDA_REDUCTION_PROFILING
          timer.stop();
@@ -426,7 +452,7 @@ reduceWithArgument( const Index size,
 
       // finish the reduction on the host
 //      auto fetch = [&] ( Index i ) { return resultArray[ i ]; };
-//      const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero );
+//      const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero );
       for( Index i = 1; i < reducedSize; i++ )
          reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] );
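
The four-argument functor expected by `reduceWithArgument` is easy to get wrong,
so here is a hedged sketch of an arg-max reduction: the functor updates its first
(index, value) pair in place whenever the second pair wins:

```
#include <utility>
#include <TNL/Algorithms/Reduction.h>

using namespace TNL;

std::pair< int, double > argMax( const double* a, int size )
{
   auto fetch = [=] ( int i ) { return a[ i ]; };
   auto reduction = [] ( int& aIdx, int bIdx, double& aVal, double bVal ) {
      if( bVal > aVal ) {
         aVal = bVal;
         aIdx = bIdx;
      }
   };
   // in this sequential code path the zero element is effectively unused,
   // since the accumulators are initialized from the first fetched values
   return Algorithms::Reduction< Devices::Sequential >::reduceWithArgument( size, reduction, fetch, 0.0 );
}
```
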
 
diff --git a/src/TNL/Algorithms/Scan.h b/src/TNL/Algorithms/Scan.h
index 2f2275c53f..81a5d2f7e7 100644
--- a/src/TNL/Algorithms/Scan.h
+++ b/src/TNL/Algorithms/Scan.h
@@ -12,6 +12,7 @@
 
 #pragma once
 
+#include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 
@@ -96,11 +97,71 @@ template< typename Device,
 struct SegmentedScan;
 
 
+template< ScanType Type >
+struct Scan< Devices::Sequential, Type >
+{
+   /**
+    * \brief Computes scan (prefix sum) sequentially.
+    *
+    * \tparam Vector type of the vector used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    *
+    * \param v input vector; the result of the scan is stored in the same vector
+    * \param begin index of the first element in the array to be scanned
+    * \param end index of the element following the last element to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param zero is the identity (neutral) element for the reduction operation, i.e. the
+    *             element which does not change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/ScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include ScanExample.out
+    */
+   template< typename Vector,
+             typename Reduction >
+   static void
+   perform( Vector& v,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::RealType zero );
+
+   template< typename Vector,
+             typename Reduction >
+   static auto
+   performFirstPhase( Vector& v,
+                      const typename Vector::IndexType begin,
+                      const typename Vector::IndexType end,
+                      const Reduction& reduction,
+                      const typename Vector::RealType zero );
+
+   template< typename Vector,
+             typename BlockShifts,
+             typename Reduction >
+   static void
+   performSecondPhase( Vector& v,
+                       const BlockShifts& blockShifts,
+                       const typename Vector::IndexType begin,
+                       const typename Vector::IndexType end,
+                       const Reduction& reduction,
+                       const typename Vector::RealType shift );
+};
+
 template< ScanType Type >
 struct Scan< Devices::Host, Type >
 {
    /**
-    * \brief Computes scan (prefix sum) on CPU.
+    * \brief Computes scan (prefix sum) using OpenMP.
     *
     * \tparam Vector type vector being used for the scan.
     * \tparam Reduction lambda function defining the reduction operation
@@ -216,11 +277,55 @@ struct Scan< Devices::Cuda, Type >
                        const typename Vector::RealType shift );
 };
 
+template< ScanType Type >
+struct SegmentedScan< Devices::Sequential, Type >
+{
+   /**
+    * \brief Computes segmented scan (prefix sum) sequentially.
+    *
+    * \tparam Vector type of the vector used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones which mark the beginnings of the segments
+    *
+    * \param v input vector; the result of the scan is stored in the same vector
+    * \param flags is an array of zeros and ones where ones mark the beginning of each segment
+    * \param begin index of the first element in the array to be scanned
+    * \param end index of the element following the last element to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param zero is the identity (neutral) element for the reduction operation, i.e. the
+    *             element which does not change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/SegmentedScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include SegmentedScanExample.out
+    */
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+   static void
+   perform( Vector& v,
+            Flags& flags,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::RealType zero );
+};
+
 template< ScanType Type >
 struct SegmentedScan< Devices::Host, Type >
 {
    /**
-    * \brief Computes segmented scan (prefix sum) on CPU.
+    * \brief Computes segmented scan (prefix sum) using OpenMP.
     *
     * \tparam Vector type vector being used for the scan.
     * \tparam Reduction lambda function defining the reduction operation
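
To make the two scan flavors concrete: an inclusive scan of {1, 2, 3, 4} yields
{1, 3, 6, 10}, while an exclusive scan yields {0, 1, 3, 6}. A minimal sketch,
assuming `Containers::Vector` supports list initialization and the new device
(whose default allocator is provided by this patch):

```
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Scan.h>

using namespace TNL;
using namespace TNL::Algorithms;

void example()
{
   Containers::Vector< double, Devices::Sequential > v{ 1, 2, 3, 4 };
   auto plus = [] ( double a, double b ) { return a + b; };
   // in-place inclusive scan: v becomes { 1, 3, 6, 10 }
   Scan< Devices::Sequential, ScanType::Inclusive >::perform( v, 0, v.getSize(), plus, 0.0 );
}
```
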
diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp
index bb2288c6ac..7b6d31ece5 100644
--- a/src/TNL/Algorithms/Scan.hpp
+++ b/src/TNL/Algorithms/Scan.hpp
@@ -24,6 +24,78 @@
 namespace TNL {
 namespace Algorithms {
 
+template< ScanType Type >
+   template< typename Vector,
+             typename Reduction >
+void
+Scan< Devices::Sequential, Type >::
+perform( Vector& v,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::RealType zero )
+{
+   // sequential prefix-sum does not need a second phase
+   performFirstPhase( v, begin, end, reduction, zero );
+}
+
+template< ScanType Type >
+   template< typename Vector,
+             typename Reduction >
+auto
+Scan< Devices::Sequential, Type >::
+performFirstPhase( Vector& v,
+                   const typename Vector::IndexType begin,
+                   const typename Vector::IndexType end,
+                   const Reduction& reduction,
+                   const typename Vector::RealType zero )
+{
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
+//   return Containers::StaticArray< 1, RealType > block_sums;
+   Containers::Array< RealType, Devices::Host > block_sums( 1 );
+   block_sums[ 0 ] = zero;
+
+   if( Type == ScanType::Inclusive ) {
+      for( IndexType i = begin + 1; i < end; i++ )
+         v[ i ] = reduction( v[ i ], v[ i - 1 ] );
+      block_sums[ 0 ] = v[ end - 1 ];
+   }
+   else // Exclusive prefix sum
+   {
+      RealType aux = zero;
+      for( IndexType i = begin; i < end; i++ ) {
+         const RealType x = v[ i ];
+         v[ i ] = aux;
+         aux = reduction( aux, x );
+      }
+      block_sums[ 0 ] = aux;
+   }
+
+   return block_sums;
+}
+
+template< ScanType Type >
+   template< typename Vector,
+             typename BlockShifts,
+             typename Reduction >
+void
+Scan< Devices::Sequential, Type >::
+performSecondPhase( Vector& v,
+                    const BlockShifts& blockShifts,
+                    const typename Vector::IndexType begin,
+                    const typename Vector::IndexType end,
+                    const Reduction& reduction,
+                    const typename Vector::RealType shift )
+{
+   using IndexType = typename Vector::IndexType;
+
+   for( IndexType i = begin; i < end; i++ )
+      v[ i ] = reduction( v[ i ], shift );
+}
+
 template< ScanType Type >
    template< typename Vector,
              typename Reduction >
@@ -39,8 +111,7 @@ perform( Vector& v,
    const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero );
    performSecondPhase( v, blockShifts, begin, end, reduction, zero );
 #else
-   // sequential prefix-sum does not need a second phase
-   performFirstPhase( v, begin, end, reduction, zero );
+   Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero );
 #endif
 }
 
@@ -55,12 +126,12 @@ performFirstPhase( Vector& v,
                    const Reduction& reduction,
                    const typename Vector::RealType zero )
 {
+#ifdef HAVE_OPENMP
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-#ifdef HAVE_OPENMP
    const int threads = Devices::Host::getMaxThreadsCount();
-   Containers::Array< RealType, Devices::Host > block_sums( threads + 1 );
+   Containers::Array< RealType > block_sums( threads + 1 );
    block_sums[ 0 ] = zero;
 
    #pragma omp parallel num_threads(threads)
@@ -98,28 +169,7 @@ performFirstPhase( Vector& v,
    // block_sums now contains shift values for each block - to be used in the second phase
    return block_sums;
 #else
-   // FIXME: StaticArray does not have getElement() which is used in DistributedScan
-//   return Containers::StaticArray< 1, RealType > block_sums;
-   Containers::Array< RealType, Devices::Host > block_sums( 1 );
-   block_sums[ 0 ] = zero;
-
-   if( Type == ScanType::Inclusive ) {
-      for( IndexType i = begin + 1; i < end; i++ )
-         v[ i ] = reduction( v[ i ], v[ i - 1 ] );
-      block_sums[ 0 ] = v[ end - 1 ];
-   }
-   else // Exclusive prefix sum
-   {
-      RealType aux = zero;
-      for( IndexType i = begin; i < end; i++ ) {
-         const RealType x = v[ i ];
-         v[ i ] = aux;
-         aux = reduction( aux, x );
-      }
-      block_sums[ 0 ] = aux;
-   }
-
-   return block_sums;
+   return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero );
 #endif
 }
 
@@ -136,10 +186,10 @@ performSecondPhase( Vector& v,
                     const Reduction& reduction,
                     const typename Vector::RealType shift )
 {
+#ifdef HAVE_OPENMP
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-#ifdef HAVE_OPENMP
    const int threads = blockShifts.getSize() - 1;
 
    // launch exactly the same number of threads as in the first phase
@@ -154,8 +204,7 @@ performSecondPhase( Vector& v,
          v[ i ] = reduction( v[ i ], offset );
    }
 #else
-   for( IndexType i = begin; i < end; i++ )
-      v[ i ] = reduction( v[ i ], shift );
+   Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, shift );
 #endif
 }
 
@@ -245,7 +294,7 @@ template< ScanType Type >
              typename Reduction,
              typename Flags >
 void
-SegmentedScan< Devices::Host, Type >::
+SegmentedScan< Devices::Sequential, Type >::
 perform( Vector& v,
          Flags& flags,
          const typename Vector::IndexType begin,
@@ -256,7 +305,6 @@ perform( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   // TODO: parallelize with OpenMP
    if( Type == ScanType::Inclusive )
    {
       for( IndexType i = begin + 1; i < end; i++ )
@@ -278,6 +326,27 @@ perform( Vector& v,
    }
 }
 
+template< ScanType Type >
+   template< typename Vector,
+             typename Reduction,
+             typename Flags >
+void
+SegmentedScan< Devices::Host, Type >::
+perform( Vector& v,
+         Flags& flags,
+         const typename Vector::IndexType begin,
+         const typename Vector::IndexType end,
+         const Reduction& reduction,
+         const typename Vector::RealType zero )
+{
+   // TODO: parallelize with OpenMP; until then, fall back to the sequential implementation
+   SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero );
+}
+
 template< ScanType Type >
    template< typename Vector,
              typename Reduction,
@@ -295,7 +364,7 @@ perform( Vector& v,
    using RealType = typename Vector::RealType;
    using IndexType = typename Vector::IndexType;
 
-   throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." );
+   throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." );
 #else
    throw Exceptions::CudaSupportMissing();
 #endif
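
And a hedged sketch of the segmented variant — ones in the flag array mark
segment beginnings, so with segments {1, 2, 3} and {4, 5} an inclusive segmented
scan produces {1, 3, 6, 4, 9}; the flag vector's element type is an assumption
here:

```
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Scan.h>

using namespace TNL;
using namespace TNL::Algorithms;

void example()
{
   Containers::Vector< double, Devices::Sequential > v{ 1, 2, 3, 4, 5 };
   Containers::Vector< int, Devices::Sequential > flags{ 1, 0, 0, 1, 0 };
   auto plus = [] ( double a, double b ) { return a + b; };
   // in-place: v becomes { 1, 3, 6, 4, 9 }
   SegmentedScan< Devices::Sequential, ScanType::Inclusive >::perform( v, flags, 0, v.getSize(), plus, 0.0 );
}
```
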
diff --git a/src/TNL/Allocators/Default.h b/src/TNL/Allocators/Default.h
index eed5c193b9..109539d0c9 100644
--- a/src/TNL/Allocators/Default.h
+++ b/src/TNL/Allocators/Default.h
@@ -14,6 +14,7 @@
 
 #include <TNL/Allocators/Host.h>
 #include <TNL/Allocators/Cuda.h>
+#include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 
@@ -27,6 +28,14 @@ namespace Allocators {
 template< typename Device >
 struct Default;
 
+//! Sets \ref Allocators::Host as the default allocator for \ref Devices::Sequential.
+template<>
+struct Default< Devices::Sequential >
+{
+   template< typename T >
+   using Allocator = Allocators::Host< T >;
+};
+
 //! Sets \ref Allocators::Host as the default allocator for \ref Devices::Host.
 template<>
 struct Default< Devices::Host >
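
A compile-time sanity check of the new mapping (a sketch):

```
#include <type_traits>
#include <TNL/Allocators/Default.h>

using namespace TNL;

// Devices::Sequential allocates ordinary host memory
static_assert( std::is_same< Allocators::Default< Devices::Sequential >::Allocator< double >,
                             Allocators::Host< double > >::value,
               "unexpected default allocator for Devices::Sequential" );
```
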
diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h
index 8472f4d715..3cbc8a7bc1 100644
--- a/src/TNL/Containers/NDArray.h
+++ b/src/TNL/Containers/NDArray.h
@@ -352,13 +352,13 @@ class StaticNDArray
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >,
-                         void >
+                         Devices::Sequential >
 {
    using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >,
                          SizesHolder,
                          Permutation,
                          __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >,
-                         void >;
+                         Devices::Sequential >;
    static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0,
                   "All dimensions of a static array must to be positive." );
 
diff --git a/src/TNL/Devices/Host.h b/src/TNL/Devices/Host.h
index 1156075834..4af7892ecc 100644
--- a/src/TNL/Devices/Host.h
+++ b/src/TNL/Devices/Host.h
@@ -19,7 +19,6 @@
 #endif
 
 namespace TNL {
-//! \brief Namespace for TNL execution models
 namespace Devices {
 
 class Host
diff --git a/src/TNL/Devices/Sequential.h b/src/TNL/Devices/Sequential.h
new file mode 100644
index 0000000000..f00660f196
--- /dev/null
+++ b/src/TNL/Devices/Sequential.h
@@ -0,0 +1,21 @@
+/***************************************************************************
+                          Sequential.h  -  description
+                             -------------------
+    begin                : Aug 17, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber et al.
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+namespace TNL {
+//! \brief Namespace for TNL execution models
+namespace Devices {
+
+struct Sequential
+{};
+
+} // namespace Devices
+} // namespace TNL
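
The tag is intentionally an empty struct: algorithms select an implementation
purely by specializing on the device type, and the tag itself carries no state.
A hypothetical illustration of that dispatch pattern (not part of the patch):

```
#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>

// hypothetical example of tag dispatch on the execution model
template< typename Device >
struct WhichDevice;

template<>
struct WhichDevice< TNL::Devices::Sequential >
{
   static constexpr const char* name = "sequential";
};

template<>
struct WhichDevice< TNL::Devices::Host >
{
   static constexpr const char* name = "host (OpenMP)";
};
```
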
-- 
GitLab