From 7756e2d0c69479158f6bd9a76f4d6694e199acd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Klinkovsk=C3=BD?= <klinkovsky@mmg.fjfi.cvut.cz> Date: Wed, 4 Sep 2019 17:17:14 +0200 Subject: [PATCH] Added Devices::Sequential and corresponding specializations in TNL::Algorithms --- src/TNL/Algorithms/CudaReductionKernel.h | 6 +- src/TNL/Algorithms/MemoryOperations.h | 6 +- src/TNL/Algorithms/MemoryOperationsHost.hpp | 6 +- .../Algorithms/MemoryOperationsSequential.hpp | 16 +- src/TNL/Algorithms/Multireduction.h | 30 +++ src/TNL/Algorithms/Multireduction.hpp | 145 +++++++----- src/TNL/Algorithms/ParallelFor.h | 107 +++++---- src/TNL/Algorithms/Reduction.h | 25 ++ src/TNL/Algorithms/Reduction.hpp | 216 ++++++++++-------- src/TNL/Algorithms/Scan.h | 109 ++++++++- src/TNL/Algorithms/Scan.hpp | 133 ++++++++--- src/TNL/Allocators/Default.h | 9 + src/TNL/Containers/NDArray.h | 4 +- src/TNL/Devices/Host.h | 1 - src/TNL/Devices/Sequential.h | 21 ++ 15 files changed, 583 insertions(+), 251 deletions(-) create mode 100644 src/TNL/Devices/Sequential.h diff --git a/src/TNL/Algorithms/CudaReductionKernel.h b/src/TNL/Algorithms/CudaReductionKernel.h index c1004a3742..b97295e000 100644 --- a/src/TNL/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Algorithms/CudaReductionKernel.h @@ -351,7 +351,7 @@ struct CudaReductionKernelLauncher // Copy result on CPU Result result; - MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 ); + MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result, output, 1 ); return result; } @@ -384,8 +384,8 @@ struct CudaReductionKernelLauncher //// // Copy result on CPU std::pair< Index, Result > result; - MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result.first, idxOutput, 1 ); - MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result.second, output, 1 ); + MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, idxOutput, 1 ); + MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, output, 1 ); return result; } diff --git a/src/TNL/Algorithms/MemoryOperations.h b/src/TNL/Algorithms/MemoryOperations.h index cdbdb79098..59da324028 100644 --- a/src/TNL/Algorithms/MemoryOperations.h +++ b/src/TNL/Algorithms/MemoryOperations.h @@ -10,6 +10,7 @@ #pragma once +#include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Cuda/CudaCallable.h> @@ -17,12 +18,11 @@ namespace TNL { namespace Algorithms { -template< typename DestinationExecution > +template< typename DestinationDevice > struct MemoryOperations; -// TODO: change "void" to "Execution::Sequential" template<> -struct MemoryOperations< void > +struct MemoryOperations< Devices::Sequential > { template< typename Element > __cuda_callable__ diff --git a/src/TNL/Algorithms/MemoryOperationsHost.hpp b/src/TNL/Algorithms/MemoryOperationsHost.hpp index a886886856..cc85975f55 100644 --- a/src/TNL/Algorithms/MemoryOperationsHost.hpp +++ b/src/TNL/Algorithms/MemoryOperationsHost.hpp @@ -93,7 +93,7 @@ copyFromIterator( DestinationElement* destination, SourceIterator first, SourceIterator last ) { - MemoryOperations< void >::copyFromIterator( destination, destinationSize, first, last ); + MemoryOperations< Devices::Sequential >::copyFromIterator( destination, destinationSize, first, last ); } template< typename DestinationElement, @@ -137,7 +137,7 @@ containsValue( const Element* data, } else { // sequential algorithm can return as soon as it finds a match - return 
MemoryOperations< void >::containsValue( data, size, value ); + return MemoryOperations< Devices::Sequential >::containsValue( data, size, value ); } } @@ -159,7 +159,7 @@ containsOnlyValue( const Element* data, } else { // sequential algorithm can return as soon as it finds a mismatch - return MemoryOperations< void >::containsOnlyValue( data, size, value ); + return MemoryOperations< Devices::Sequential >::containsOnlyValue( data, size, value ); } } diff --git a/src/TNL/Algorithms/MemoryOperationsSequential.hpp b/src/TNL/Algorithms/MemoryOperationsSequential.hpp index e427f00dd5..9e5ad25b13 100644 --- a/src/TNL/Algorithms/MemoryOperationsSequential.hpp +++ b/src/TNL/Algorithms/MemoryOperationsSequential.hpp @@ -18,7 +18,7 @@ namespace Algorithms { template< typename Element > __cuda_callable__ void -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: setElement( Element* data, const Element& value ) { @@ -28,7 +28,7 @@ setElement( Element* data, template< typename Element > __cuda_callable__ Element -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: getElement( const Element* data ) { return *data; @@ -37,7 +37,7 @@ getElement( const Element* data ) template< typename Element, typename Index > __cuda_callable__ void -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: set( Element* data, const Element& value, const Index size ) @@ -51,7 +51,7 @@ template< typename DestinationElement, typename Index > __cuda_callable__ void -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: copy( DestinationElement* destination, const SourceElement* source, const Index size ) @@ -64,7 +64,7 @@ template< typename DestinationElement, typename Index, typename SourceIterator > void -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: copyFromIterator( DestinationElement* destination, Index destinationSize, SourceIterator first, @@ -82,7 +82,7 @@ template< typename Element1, typename Index > __cuda_callable__ bool -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: compare( const Element1* destination, const Element2* source, const Index size ) @@ -97,7 +97,7 @@ template< typename Element, typename Index > __cuda_callable__ bool -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: containsValue( const Element* data, const Index size, const Element& value ) @@ -116,7 +116,7 @@ template< typename Element, typename Index > __cuda_callable__ bool -MemoryOperations< void >:: +MemoryOperations< Devices::Sequential >:: containsOnlyValue( const Element* data, const Index size, const Element& value ) diff --git a/src/TNL/Algorithms/Multireduction.h b/src/TNL/Algorithms/Multireduction.h index ac67255feb..8e63fa7eab 100644 --- a/src/TNL/Algorithms/Multireduction.h +++ b/src/TNL/Algorithms/Multireduction.h @@ -14,6 +14,7 @@ #include <functional> // reduction functions like std::plus, std::logical_and, std::logical_or etc. 
+#include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> @@ -23,6 +24,35 @@ namespace Algorithms { template< typename Device > struct Multireduction; +template<> +struct Multireduction< Devices::Sequential > +{ + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. + * size: the size of each dataset + * n: number of datasets to be reduced + * result: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > + static constexpr void + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const Index size, + const int n, + Result* result ); +}; + template<> struct Multireduction< Devices::Host > { diff --git a/src/TNL/Algorithms/Multireduction.hpp b/src/TNL/Algorithms/Multireduction.hpp index 25b91f0267..0bfead2871 100644 --- a/src/TNL/Algorithms/Multireduction.hpp +++ b/src/TNL/Algorithms/Multireduction.hpp @@ -29,6 +29,83 @@ namespace TNL { namespace Algorithms { +template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > +void constexpr +Multireduction< Devices::Sequential >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const Index size, + const int n, + Result* result ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); + + constexpr int block_size = 128; + const int blocks = size / block_size; + + if( blocks > 1 ) { + // initialize array for unrolled results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + // main reduction (explicitly unrolled loop) + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // reduction of the last, incomplete block (not unrolled) + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + + // reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); + + // copy the result into the output parameter + result[ k ] = _r[ 0 ]; + } + } + else { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + for( int i = 0; i < block_size; i++ ) + result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) ); + } + } + + for( int k = 0; k < n; k++ ) { + for( Index i = blocks * block_size; i < size; i++ ) + result[ k ] = reduction( result[ k ], dataFetcher( i, k 
) ); + } + } +} + template< typename Result, typename DataFetcher, typename Reduction, @@ -45,10 +122,10 @@ reduce( const Result zero, TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); +#ifdef HAVE_OPENMP constexpr int block_size = 128; const int blocks = size / block_size; -#ifdef HAVE_OPENMP if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); #pragma omp parallel num_threads(threads) @@ -106,67 +183,9 @@ reduce( const Result zero, } } } - else { -#endif - if( blocks > 1 ) { - // initialize array for unrolled results - // (it is accessed as a row-major matrix with n rows and 4 columns) - Result r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = zero; - - // main reduction (explicitly unrolled loop) - for( int b = 0; b < blocks; b++ ) { - const Index offset = b * block_size; - for( int k = 0; k < n; k++ ) { - Result* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); - _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); - _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); - _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); - } - } - } - - // reduction of the last, incomplete block (not unrolled) - for( int k = 0; k < n; k++ ) { - Result* _r = r + 4 * k; - for( Index i = blocks * block_size; i < size; i++ ) - _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); - } - - // reduction of unrolled results - for( int k = 0; k < n; k++ ) { - Result* _r = r + 4 * k; - _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); - _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); - _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); - - // copy the result into the output parameter - result[ k ] = _r[ 0 ]; - } - } - else { - for( int k = 0; k < n; k++ ) - result[ k ] = zero; - - for( int b = 0; b < blocks; b++ ) { - const Index offset = b * block_size; - for( int k = 0; k < n; k++ ) { - for( int i = 0; i < block_size; i++ ) - result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) ); - } - } - - for( int k = 0; k < n; k++ ) { - for( Index i = blocks * block_size; i < size; i++ ) - result[ k ] = reduction( result[ k ], dataFetcher( i, k ) ); - } - } -#ifdef HAVE_OPENMP - } + else #endif + Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result ); } template< typename Result, @@ -204,7 +223,7 @@ reduce( const Result zero, // transfer the reduced data from device to host std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] }; - MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize ); + MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -215,7 +234,7 @@ reduce( const Result zero, // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; - Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); + Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); diff --git a/src/TNL/Algorithms/ParallelFor.h b/src/TNL/Algorithms/ParallelFor.h index 20e87f2220..6d5e917ba4 100644 --- a/src/TNL/Algorithms/ParallelFor.h +++ 
b/src/TNL/Algorithms/ParallelFor.h @@ -10,11 +10,13 @@ #pragma once +#include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Cuda/CheckDevice.h> #include <TNL/Cuda/DeviceInfo.h> #include <TNL/Cuda/LaunchHelpers.h> +#include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Math.h> /**** @@ -33,9 +35,53 @@ namespace Algorithms { enum ParallelForMode { SynchronousMode, AsynchronousMode }; -template< typename Device = Devices::Host, +template< typename Device = Devices::Sequential, ParallelForMode Mode = SynchronousMode > struct ParallelFor +{ + template< typename Index, + typename Function, + typename... FunctionArgs > + static void exec( Index start, Index end, Function f, FunctionArgs... args ) + { + for( Index i = start; i < end; i++ ) + f( i, args... ); + } +}; + +template< typename Device = Devices::Sequential, + ParallelForMode Mode = SynchronousMode > +struct ParallelFor2D +{ + template< typename Index, + typename Function, + typename... FunctionArgs > + static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args ) + { + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, args... ); + } +}; + +template< typename Device = Devices::Sequential, + ParallelForMode Mode = SynchronousMode > +struct ParallelFor3D +{ + template< typename Index, + typename Function, + typename... FunctionArgs > + static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args ) + { + for( Index k = startZ; k < endZ; k++ ) + for( Index j = startY; j < endY; j++ ) + for( Index i = startX; i < endX; i++ ) + f( i, j, k, args... ); + } +}; + +template< ParallelForMode Mode > +struct ParallelFor< Devices::Host, Mode > { template< typename Index, typename Function, @@ -44,26 +90,23 @@ struct ParallelFor { #ifdef HAVE_OPENMP // Benchmarks show that this is significantly faster compared - // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )' - if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) + // to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() && end - start > 512 )' + if( Devices::Host::isOMPEnabled() && end - start > 512 ) { -#pragma omp parallel for + #pragma omp parallel for for( Index i = start; i < end; i++ ) f( i, args... ); } else - for( Index i = start; i < end; i++ ) - f( i, args... ); + ParallelFor< Devices::Sequential >::exec( start, end, f, args... ); #else - for( Index i = start; i < end; i++ ) - f( i, args... ); + ParallelFor< Devices::Sequential >::exec( start, end, f, args... ); #endif } }; -template< typename Device = Devices::Host, - ParallelForMode Mode = SynchronousMode > -struct ParallelFor2D +template< ParallelForMode Mode > +struct ParallelFor2D< Devices::Host, Mode > { template< typename Index, typename Function, @@ -72,30 +115,24 @@ struct ParallelFor2D { #ifdef HAVE_OPENMP // Benchmarks show that this is significantly faster compared - // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' - if( TNL::Devices::Host::isOMPEnabled() ) + // to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() )' + if( Devices::Host::isOMPEnabled() ) { -#pragma omp parallel for - for( Index j = startY; j < endY; j++ ) - for( Index i = startX; i < endX; i++ ) - f( i, j, args... ); - } - else { + #pragma omp parallel for for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) f( i, j, args... 
); } + else + ParallelFor2D< Devices::Sequential >::exec( startX, startY, endX, endY, f, args... ); #else - for( Index j = startY; j < endY; j++ ) - for( Index i = startX; i < endX; i++ ) - f( i, j, args... ); + ParallelFor2D< Devices::Sequential >::exec( startX, startY, endX, endY, f, args... ); #endif } }; -template< typename Device = Devices::Host, - ParallelForMode Mode = SynchronousMode > -struct ParallelFor3D +template< ParallelForMode Mode > +struct ParallelFor3D< Devices::Host, Mode > { template< typename Index, typename Function, @@ -104,27 +141,19 @@ struct ParallelFor3D { #ifdef HAVE_OPENMP // Benchmarks show that this is significantly faster compared - // to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )' - if( TNL::Devices::Host::isOMPEnabled() ) - { -#pragma omp parallel for collapse(2) - for( Index k = startZ; k < endZ; k++ ) - for( Index j = startY; j < endY; j++ ) - for( Index i = startX; i < endX; i++ ) - f( i, j, k, args... ); - } - else + // to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() )' + if( Devices::Host::isOMPEnabled() ) { + #pragma omp parallel for collapse(2) for( Index k = startZ; k < endZ; k++ ) for( Index j = startY; j < endY; j++ ) for( Index i = startX; i < endX; i++ ) f( i, j, k, args... ); } + else + ParallelFor3D< Devices::Sequential >::exec( startX, startY, startZ, endX, endY, endZ, f, args... ); #else - for( Index k = startZ; k < endZ; k++ ) - for( Index j = startY; j < endY; j++ ) - for( Index i = startX; i < endX; i++ ) - f( i, j, k, args... ); + ParallelFor3D< Devices::Sequential >::exec( startX, startY, startZ, endX, endY, endZ, f, args... ); #endif } }; diff --git a/src/TNL/Algorithms/Reduction.h b/src/TNL/Algorithms/Reduction.h index e77fa12064..c0d62684d5 100644 --- a/src/TNL/Algorithms/Reduction.h +++ b/src/TNL/Algorithms/Reduction.h @@ -15,6 +15,7 @@ #include <utility> // std::pair #include <functional> // reduction functions like std::plus, std::logical_and, std::logical_or etc. 
+#include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> @@ -36,6 +37,30 @@ namespace Algorithms { template< typename Device > struct Reduction; +template<> +struct Reduction< Devices::Sequential > +{ + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static constexpr Result + reduce( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); + + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static constexpr std::pair< Index, Result > + reduceWithArgument( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); +}; + template<> struct Reduction< Devices::Host > { diff --git a/src/TNL/Algorithms/Reduction.hpp b/src/TNL/Algorithms/Reduction.hpp index 9fd56576eb..b07f04445e 100644 --- a/src/TNL/Algorithms/Reduction.hpp +++ b/src/TNL/Algorithms/Reduction.hpp @@ -17,8 +17,8 @@ //#define CUDA_REDUCTION_PROFILING #include <TNL/Algorithms/Reduction.h> -#include <TNL/Algorithms/MultiDeviceMemoryOperations.h> #include <TNL/Algorithms/CudaReductionKernel.h> +#include <TNL/Algorithms/MultiDeviceMemoryOperations.h> #ifdef CUDA_REDUCTION_PROFILING #include <iostream> @@ -35,8 +35,115 @@ namespace Algorithms { */ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; -//// -// Reduction on host +template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > +constexpr Result +Reduction< Devices::Sequential >:: +reduce( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ) +{ + constexpr int block_size = 128; + const int blocks = size / block_size; + + if( blocks > 1 ) { + // initialize array for unrolled results + Result r[ 4 ] = { zero, zero, zero, zero }; + + // main reduction (explicitly unrolled loop) + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int i = 0; i < block_size; i += 4 ) { + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + } + } + + // reduction of the last, incomplete block (not unrolled) + for( Index i = blocks * block_size; i < size; i++ ) + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); + + // reduction of unrolled results + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); + return r[ 0 ]; + } + else { + Result result = zero; + for( Index i = 0; i < size; i++ ) + result = reduction( result, dataFetcher( i ) ); + return result; + } +} + +template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > +constexpr std::pair< Index, Result > +Reduction< Devices::Sequential >:: +reduceWithArgument( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ) +{ + constexpr int block_size = 128; + const int blocks = size / block_size; + + if( blocks > 1 ) { + // initialize array for unrolled results + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; + bool initialized( false ); + + // main reduction (explicitly unrolled loop) + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int 
i = 0; i < block_size; i += 4 ) { + if( ! initialized ) + { + arg[ 0 ] = offset + i; + arg[ 1 ] = offset + i + 1; + arg[ 2 ] = offset + i + 2; + arg[ 3 ] = offset + i + 3; + r[ 0 ] = dataFetcher( offset + i ); + r[ 1 ] = dataFetcher( offset + i + 1 ); + r[ 2 ] = dataFetcher( offset + i + 2 ); + r[ 3 ] = dataFetcher( offset + i + 3 ); + initialized = true; + continue; + } + reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) ); + reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) ); + } + } + + // reduction of the last, incomplete block (not unrolled) + for( Index i = blocks * block_size; i < size; i++ ) + reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); + + // reduction of unrolled results + reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] ); + reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] ); + reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] ); + return std::make_pair( arg[ 0 ], r[ 0 ] ); + } + else { + std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); + for( Index i = 1; i < size; i++ ) + reduction( result.first, i, result.second, dataFetcher( i ) ); + return result; + } +} + template< typename Index, typename Result, typename ReductionOperation, @@ -48,10 +155,10 @@ reduce( const Index size, DataFetcher& dataFetcher, const Result& zero ) { +#ifdef HAVE_OPENMP constexpr int block_size = 128; const int blocks = size / block_size; -#ifdef HAVE_OPENMP if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable Result result = zero; @@ -92,42 +199,9 @@ reduce( const Index size, } return result; } - else { -#endif - if( blocks > 1 ) { - // initialize array for unrolled results - Result r[ 4 ] = { zero, zero, zero, zero }; - - // main reduction (explicitly unrolled loop) - for( int b = 0; b < blocks; b++ ) { - const Index offset = b * block_size; - for( int i = 0; i < block_size; i += 4 ) { - r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); - r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); - } - } - - // reduction of the last, incomplete block (not unrolled) - for( Index i = blocks * block_size; i < size; i++ ) - r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); - - // reduction of unrolled results - r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); - r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); - r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); - return r[ 0 ]; - } - else { - Result result = zero; - for( Index i = 0; i < size; i++ ) - result = reduction( result, dataFetcher( i ) ); - return result; - } -#ifdef HAVE_OPENMP - } + else #endif + return Reduction< Devices::Sequential >::reduce( size, reduction, dataFetcher, zero ); } template< typename Index, @@ -141,10 +215,10 @@ reduceWithArgument( const Index size, DataFetcher& dataFetcher, const Result& zero ) { +#ifdef HAVE_OPENMP constexpr int block_size = 128; const int blocks = size / block_size; -#ifdef HAVE_OPENMP if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable std::pair< Index, Result > result( -1, zero ); @@ -201,57 +275,9 @@ reduceWithArgument( const Index size, } return result; } - else { -#endif - if( blocks > 1 ) { - // initialize array for unrolled results - Index arg[ 4 ] = { 0, 0, 0, 0 }; - Result r[ 4 ] = { zero, zero, zero, zero }; - bool initialized( false ); - - 
// main reduction (explicitly unrolled loop) - for( int b = 0; b < blocks; b++ ) { - const Index offset = b * block_size; - for( int i = 0; i < block_size; i += 4 ) { - if( ! initialized ) - { - arg[ 0 ] = offset + i; - arg[ 1 ] = offset + i + 1; - arg[ 2 ] = offset + i + 2; - arg[ 3 ] = offset + i + 3; - r[ 0 ] = dataFetcher( offset + i ); - r[ 1 ] = dataFetcher( offset + i + 1 ); - r[ 2 ] = dataFetcher( offset + i + 2 ); - r[ 3 ] = dataFetcher( offset + i + 3 ); - initialized = true; - continue; - } - reduction( arg[ 0 ], offset + i, r[ 0 ], dataFetcher( offset + i ) ); - reduction( arg[ 1 ], offset + i + 1, r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( arg[ 2 ], offset + i + 2, r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( arg[ 3 ], offset + i + 3, r[ 3 ], dataFetcher( offset + i + 3 ) ); - } - } - - // reduction of the last, incomplete block (not unrolled) - for( Index i = blocks * block_size; i < size; i++ ) - reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); - - // reduction of unrolled results - reduction( arg[ 0 ], arg[ 2 ], r[ 0 ], r[ 2 ] ); - reduction( arg[ 1 ], arg[ 3 ], r[ 1 ], r[ 3 ] ); - reduction( arg[ 0 ], arg[ 1 ], r[ 0 ], r[ 1 ] ); - return std::make_pair( arg[ 0 ], r[ 0 ] ); - } - else { - std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); - for( Index i = 1; i < size; i++ ) - reduction( result.first, i, result.second, dataFetcher( i ) ); - return result; - } -#ifdef HAVE_OPENMP - } + else #endif + return Reduction< Devices::Sequential >::reduceWithArgument( size, reduction, dataFetcher, zero ); } template< typename Index, @@ -309,7 +335,7 @@ reduce( const Index size, new Result[ reducedSize ] #endif }; - MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); + MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -320,7 +346,7 @@ reduce( const Index size, // finish the reduction on the host auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; - const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); + const Result result = Reduction< Devices::Sequential >::reduce( reducedSize, reduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -414,8 +440,8 @@ reduceWithArgument( const Index size, new Index[ reducedSize ] #endif }; - MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); - MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); + MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); + MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -426,7 +452,7 @@ reduceWithArgument( const Index size, // finish the reduction on the host // auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; -// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); +// const Result result = Reduction< Devices::Sequential >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); diff --git a/src/TNL/Algorithms/Scan.h b/src/TNL/Algorithms/Scan.h index 2f2275c53f..81a5d2f7e7 100644 
--- a/src/TNL/Algorithms/Scan.h
+++ b/src/TNL/Algorithms/Scan.h
@@ -12,6 +12,7 @@
 
 #pragma once
 
+#include <TNL/Devices/Sequential.h>
 #include <TNL/Devices/Host.h>
 #include <TNL/Devices/Cuda.h>
 
@@ -96,11 +97,71 @@ template< typename Device,
 struct SegmentedScan;
 
 
+template< ScanType Type >
+struct Scan< Devices::Sequential, Type >
+{
+   /**
+    * \brief Computes scan (prefix sum) sequentially.
+    *
+    * \tparam Vector type vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    *
+    * \param v input vector, the result of scan is stored in the same vector
+    * \param begin the first element in the array to be scanned
+    * \param end one past the last element in the array to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param zero is the neutral element for the reduction operation, i.e. the element which
+    *             does not change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
+    * ```
+    *
+    * \par Example
+    *
+    * \include ReductionAndScan/ScanExample.cpp
+    *
+    * \par Output
+    *
+    * \include ScanExample.out
+    */
+   template< typename Vector,
+             typename Reduction >
+   static void
+   perform( Vector& v,
+            const typename Vector::IndexType begin,
+            const typename Vector::IndexType end,
+            const Reduction& reduction,
+            const typename Vector::RealType zero );
+
+   template< typename Vector,
+             typename Reduction >
+   static auto
+   performFirstPhase( Vector& v,
+                      const typename Vector::IndexType begin,
+                      const typename Vector::IndexType end,
+                      const Reduction& reduction,
+                      const typename Vector::RealType zero );
+
+   template< typename Vector,
+             typename BlockShifts,
+             typename Reduction >
+   static void
+   performSecondPhase( Vector& v,
+                       const BlockShifts& blockShifts,
+                       const typename Vector::IndexType begin,
+                       const typename Vector::IndexType end,
+                       const Reduction& reduction,
+                       const typename Vector::RealType shift );
+};
+
 template< ScanType Type >
 struct Scan< Devices::Host, Type >
 {
    /**
-    * \brief Computes scan (prefix sum) on CPU.
+    * \brief Computes scan (prefix sum) using OpenMP.
     *
     * \tparam Vector type vector being used for the scan.
     * \tparam Reduction lambda function defining the reduction operation
@@ -216,11 +277,55 @@ struct Scan< Devices::Cuda, Type >
                                  const typename Vector::RealType shift );
 };
 
+template< ScanType Type >
+struct SegmentedScan< Devices::Sequential, Type >
+{
+   /**
+    * \brief Computes segmented scan (prefix sum) sequentially.
+    *
+    * \tparam Vector type vector being used for the scan.
+    * \tparam Reduction lambda function defining the reduction operation
+    * \tparam Flags array type containing zeros and ones marking the beginning of each segment
+    *
+    * \param v input vector, the result of scan is stored in the same vector
+    * \param flags is an array with zeros and ones marking the beginning of each segment
+    * \param begin the first element in the array to be scanned
+    * \param end one past the last element in the array to be scanned
+    * \param reduction lambda function implementing the reduction operation
+    * \param zero is the neutral element for the reduction operation, i.e. the element which
+    *             does not change the result of the reduction.
+    *
+    * The reduction lambda function takes two variables which are supposed to be reduced:
+    *
+    * ```
+    * auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ...
}; + * ``` + * + * \par Example + * + * \include ReductionAndScan/SegmentedScanExample.cpp + * + * \par Output + * + * \include SegmentedScanExample.out + */ + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); +}; + template< ScanType Type > struct SegmentedScan< Devices::Host, Type > { /** - * \brief Computes segmented scan (prefix sum) on CPU. + * \brief Computes segmented scan (prefix sum) using OpenMP. * * \tparam Vector type vector being used for the scan. * \tparam Reduction lambda function defining the reduction operation diff --git a/src/TNL/Algorithms/Scan.hpp b/src/TNL/Algorithms/Scan.hpp index bb2288c6ac..7b6d31ece5 100644 --- a/src/TNL/Algorithms/Scan.hpp +++ b/src/TNL/Algorithms/Scan.hpp @@ -24,6 +24,78 @@ namespace TNL { namespace Algorithms { +template< ScanType Type > + template< typename Vector, + typename Reduction > +void +Scan< Devices::Sequential, Type >:: +perform( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) +{ + // sequential prefix-sum does not need a second phase + performFirstPhase( v, begin, end, reduction, zero ); +} + +template< ScanType Type > + template< typename Vector, + typename Reduction > +auto +Scan< Devices::Sequential, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) +{ + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + // FIXME: StaticArray does not have getElement() which is used in DistributedScan +// return Containers::StaticArray< 1, RealType > block_sums; + Containers::Array< RealType, Devices::Host > block_sums( 1 ); + block_sums[ 0 ] = zero; + + if( Type == ScanType::Inclusive ) { + for( IndexType i = begin + 1; i < end; i++ ) + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); + block_sums[ 0 ] = v[ end - 1 ]; + } + else // Exclusive prefix sum + { + RealType aux = zero; + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; + v[ i ] = aux; + aux = reduction( aux, x ); + } + block_sums[ 0 ] = aux; + } + + return block_sums; +} + +template< ScanType Type > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +Scan< Devices::Sequential, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ) +{ + using IndexType = typename Vector::IndexType; + + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], shift ); +} + template< ScanType Type > template< typename Vector, typename Reduction > @@ -39,8 +111,7 @@ perform( Vector& v, const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero ); performSecondPhase( v, blockShifts, begin, end, reduction, zero ); #else - // sequential prefix-sum does not need a second phase - performFirstPhase( v, begin, end, reduction, zero ); + Scan< Devices::Sequential, Type >::perform( v, begin, end, reduction, zero ); #endif } @@ -55,12 +126,12 @@ performFirstPhase( Vector& v, const Reduction& reduction, const typename Vector::RealType zero ) { +#ifdef 
HAVE_OPENMP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; -#ifdef HAVE_OPENMP const int threads = Devices::Host::getMaxThreadsCount(); - Containers::Array< RealType, Devices::Host > block_sums( threads + 1 ); + Containers::Array< RealType > block_sums( threads + 1 ); block_sums[ 0 ] = zero; #pragma omp parallel num_threads(threads) @@ -98,28 +169,7 @@ performFirstPhase( Vector& v, // block_sums now contains shift values for each block - to be used in the second phase return block_sums; #else - // FIXME: StaticArray does not have getElement() which is used in DistributedScan -// return Containers::StaticArray< 1, RealType > block_sums; - Containers::Array< RealType, Devices::Host > block_sums( 1 ); - block_sums[ 0 ] = zero; - - if( Type == ScanType::Inclusive ) { - for( IndexType i = begin + 1; i < end; i++ ) - v[ i ] = reduction( v[ i ], v[ i - 1 ] ); - block_sums[ 0 ] = v[ end - 1 ]; - } - else // Exclusive prefix sum - { - RealType aux = zero; - for( IndexType i = begin; i < end; i++ ) { - const RealType x = v[ i ]; - v[ i ] = aux; - aux = reduction( aux, x ); - } - block_sums[ 0 ] = aux; - } - - return block_sums; + return Scan< Devices::Sequential, Type >::performFirstPhase( v, begin, end, reduction, zero ); #endif } @@ -136,10 +186,10 @@ performSecondPhase( Vector& v, const Reduction& reduction, const typename Vector::RealType shift ) { +#ifdef HAVE_OPENMP using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; -#ifdef HAVE_OPENMP const int threads = blockShifts.getSize() - 1; // launch exactly the same number of threads as in the first phase @@ -154,8 +204,7 @@ performSecondPhase( Vector& v, v[ i ] = reduction( v[ i ], offset ); } #else - for( IndexType i = begin; i < end; i++ ) - v[ i ] = reduction( v[ i ], shift ); + Scan< Devices::Sequential, Type >::performSecondPhase( v, blockShifts, begin, end, reduction, shift ); #endif } @@ -245,7 +294,7 @@ template< ScanType Type > typename Reduction, typename Flags > void -SegmentedScan< Devices::Host, Type >:: +SegmentedScan< Devices::Sequential, Type >:: perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, @@ -256,7 +305,6 @@ perform( Vector& v, using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - // TODO: parallelize with OpenMP if( Type == ScanType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) @@ -278,6 +326,27 @@ perform( Vector& v, } } +template< ScanType Type > + template< typename Vector, + typename Reduction, + typename Flags > +void +SegmentedScan< Devices::Host, Type >:: +perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) +{ +#ifdef HAVE_OPENMP + // TODO: parallelize with OpenMP + SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero ); +#else + SegmentedScan< Devices::Sequential, Type >::perform( v, flags, begin, end, reduction, zero ); +#endif +} + template< ScanType Type > template< typename Vector, typename Reduction, @@ -295,7 +364,7 @@ perform( Vector& v, using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); + throw Exceptions::NotImplementedError( "Segmented scan (prefix sum) is not implemented for CUDA." 
); #else throw Exceptions::CudaSupportMissing(); #endif diff --git a/src/TNL/Allocators/Default.h b/src/TNL/Allocators/Default.h index eed5c193b9..109539d0c9 100644 --- a/src/TNL/Allocators/Default.h +++ b/src/TNL/Allocators/Default.h @@ -14,6 +14,7 @@ #include <TNL/Allocators/Host.h> #include <TNL/Allocators/Cuda.h> +#include <TNL/Devices/Sequential.h> #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> @@ -27,6 +28,14 @@ namespace Allocators { template< typename Device > struct Default; +//! Sets \ref Allocators::Host as the default allocator for \ref Devices::Sequential. +template<> +struct Default< Devices::Sequential > +{ + template< typename T > + using Allocator = Allocators::Host< T >; +}; + //! Sets \ref Allocators::Host as the default allocator for \ref Devices::Host. template<> struct Default< Devices::Host > diff --git a/src/TNL/Containers/NDArray.h b/src/TNL/Containers/NDArray.h index 8472f4d715..3cbc8a7bc1 100644 --- a/src/TNL/Containers/NDArray.h +++ b/src/TNL/Containers/NDArray.h @@ -352,13 +352,13 @@ class StaticNDArray SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, - void > + Devices::Sequential > { using Base = NDArrayStorage< StaticArray< __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get(), Value >, SizesHolder, Permutation, __ndarray_impl::NDArrayBase< SliceInfo< 0, 0 > >, - void >; + Devices::Sequential >; static_assert( __ndarray_impl::StaticStorageSizeGetter< SizesHolder >::get() > 0, "All dimensions of a static array must to be positive." ); diff --git a/src/TNL/Devices/Host.h b/src/TNL/Devices/Host.h index 1156075834..4af7892ecc 100644 --- a/src/TNL/Devices/Host.h +++ b/src/TNL/Devices/Host.h @@ -19,7 +19,6 @@ #endif namespace TNL { -//! \brief Namespace for TNL execution models namespace Devices { class Host diff --git a/src/TNL/Devices/Sequential.h b/src/TNL/Devices/Sequential.h new file mode 100644 index 0000000000..f00660f196 --- /dev/null +++ b/src/TNL/Devices/Sequential.h @@ -0,0 +1,21 @@ +/*************************************************************************** + Sequential.h - description + ------------------- + begin : Aug 17, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +namespace TNL { +//! \brief Namespace for TNL execution models +namespace Devices { + +struct Sequential +{}; + +} // namespace Devices +} // namespace TNL -- GitLab
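
For reviewers, here is a minimal usage sketch (illustrative only, not part of this commit) showing how the new `Devices::Sequential` tag plugs into the algorithms above: a sequential `ParallelFor` fills an array and a sequential `Reduction` sums it. The include paths and call signatures follow the declarations in this patch; everything else in the program is an assumption for the sake of the example.

```cpp
// Illustrative sketch only -- not part of the patch. Exercises the new
// Devices::Sequential specializations declared above.
#include <iostream>
#include <functional>  // std::plus

#include <TNL/Devices/Sequential.h>
#include <TNL/Algorithms/ParallelFor.h>
#include <TNL/Algorithms/Reduction.h>

int main()
{
   constexpr int size = 1000;
   double data[ size ];

   // ParallelFor< Devices::Sequential >::exec is a plain serial loop.
   TNL::Algorithms::ParallelFor< TNL::Devices::Sequential >::exec( 0, size,
      [&] ( int i ) { data[ i ] = 1.0 / ( i + 1 ); } );

   // Reduction< Devices::Sequential >::reduce( size, reduction, dataFetcher, zero )
   // runs the explicitly unrolled sequential kernel added by this patch;
   // Result is deduced from the zero element (here: double).
   auto fetch = [&] ( int i ) { return data[ i ]; };
   const double sum = TNL::Algorithms::Reduction< TNL::Devices::Sequential >::reduce(
      size, std::plus<>{}, fetch, 0.0 );

   std::cout << "sum = " << sum << std::endl;
   return 0;
}
```

Note that when OpenMP is disabled, the `Devices::Host` specializations in this patch forward to exactly these `Devices::Sequential` code paths, so the sketch also reflects what a non-OpenMP host build executes.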