Commit 7756e2d0 authored by Jakub Klinkovský

Added Devices::Sequential and corresponding specializations in TNL::Algorithms

parent dbfa5d11
@@ -351,7 +351,7 @@ struct CudaReductionKernelLauncher
// Copy result on CPU
Result result;
-MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 );
+MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result, output, 1 );
return result;
}
@@ -384,8 +384,8 @@ struct CudaReductionKernelLauncher
////
// Copy result on CPU
std::pair< Index, Result > result;
-MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result.first, idxOutput, 1 );
-MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( &result.second, output, 1 );
+MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.first, idxOutput, 1 );
+MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( &result.second, output, 1 );
return result;
}
......
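For context, the `MultiDeviceMemoryOperations< void, Devices::Cuda >` call moves the reduced scalar from device memory to the host. A minimal sketch of such a host ← device copy, assuming it ultimately wraps `cudaMemcpy`; `copyDeviceToHost` is a hypothetical name, not the TNL implementation:

```cpp
#include <cuda_runtime.h>
#include <stdexcept>

// Hypothetical sketch of a host <- device copy; the real specialization
// also handles mixed element types via its own helpers.
template< typename Element, typename Index >
void copyDeviceToHost( Element* destination, const Element* source, Index size )
{
   const cudaError_t err = cudaMemcpy( destination, source,
                                       size * sizeof( Element ),
                                       cudaMemcpyDeviceToHost );
   if( err != cudaSuccess )
      throw std::runtime_error( cudaGetErrorString( err ) );
}
```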
@@ -10,6 +10,7 @@
#pragma once
+#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Cuda/CudaCallable.h>
@@ -17,12 +18,11 @@
namespace TNL {
namespace Algorithms {
-template< typename DestinationExecution >
+template< typename DestinationDevice >
struct MemoryOperations;
-// TODO: change "void" to "Execution::Sequential"
template<>
-struct MemoryOperations< void >
+struct MemoryOperations< Devices::Sequential >
{
template< typename Element >
__cuda_callable__
......
@@ -93,7 +93,7 @@ copyFromIterator( DestinationElement* destination,
SourceIterator first,
SourceIterator last )
{
-MemoryOperations< void >::copyFromIterator( destination, destinationSize, first, last );
+MemoryOperations< Devices::Sequential >::copyFromIterator( destination, destinationSize, first, last );
}
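The delegated sequential `copyFromIterator` is, in essence, a bounded element-by-element copy. A minimal sketch under that assumption (the throwing behavior on overflow is my guess, not confirmed by this diff):

```cpp
#include <stdexcept>

// Sketch: copy at most destinationSize elements from [first, last);
// complain if the source range does not fit.
template< typename DestinationElement, typename Index, typename SourceIterator >
void copyFromIteratorSketch( DestinationElement* destination, Index destinationSize,
                             SourceIterator first, SourceIterator last )
{
   Index i = 0;
   while( i < destinationSize && first != last )
      destination[ i++ ] = *first++;
   if( first != last )
      throw std::length_error( "source iterator range is longer than the destination array" );
}
```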
template< typename DestinationElement,
@@ -137,7 +137,7 @@ containsValue( const Element* data,
}
else {
// sequential algorithm can return as soon as it finds a match
-return MemoryOperations< void >::containsValue( data, size, value );
+return MemoryOperations< Devices::Sequential >::containsValue( data, size, value );
}
}
@@ -159,7 +159,7 @@ containsOnlyValue( const Element* data,
}
else {
// sequential algorithm can return as soon as it finds a mismatch
-return MemoryOperations< void >::containsOnlyValue( data, size, value );
+return MemoryOperations< Devices::Sequential >::containsOnlyValue( data, size, value );
}
}
......
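The comments above explain why the host implementation falls back to the sequential version for these predicates: a serial scan can stop at the first hit or mismatch, which an OpenMP-parallelized loop cannot do as easily. A sketch of the early-exit pattern (hypothetical helper, not the TNL source):

```cpp
// Sketch of an early-exit sequential search.
template< typename Element, typename Index >
bool containsValueSketch( const Element* data, Index size, const Element& value )
{
   for( Index i = 0; i < size; i++ )
      if( data[ i ] == value )
         return true;   // stop scanning as soon as a match is found
   return false;
}
```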
@@ -18,7 +18,7 @@ namespace Algorithms {
template< typename Element >
__cuda_callable__
void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
setElement( Element* data,
const Element& value )
{
@@ -28,7 +28,7 @@ setElement( Element* data,
template< typename Element >
__cuda_callable__
Element
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
getElement( const Element* data )
{
return *data;
@@ -37,7 +37,7 @@ getElement( const Element* data )
template< typename Element, typename Index >
__cuda_callable__
void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
set( Element* data,
const Element& value,
const Index size )
@@ -51,7 +51,7 @@ template< typename DestinationElement,
typename Index >
__cuda_callable__
void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
copy( DestinationElement* destination,
const SourceElement* source,
const Index size )
@@ -64,7 +64,7 @@ template< typename DestinationElement,
typename Index,
typename SourceIterator >
void
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
copyFromIterator( DestinationElement* destination,
Index destinationSize,
SourceIterator first,
@@ -82,7 +82,7 @@ template< typename Element1,
typename Index >
__cuda_callable__
bool
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
compare( const Element1* destination,
const Element2* source,
const Index size )
@@ -97,7 +97,7 @@ template< typename Element,
typename Index >
__cuda_callable__
bool
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
containsValue( const Element* data,
const Index size,
const Element& value )
@@ -116,7 +116,7 @@ template< typename Element,
typename Index >
__cuda_callable__
bool
-MemoryOperations< void >::
+MemoryOperations< Devices::Sequential >::
containsOnlyValue( const Element* data,
const Index size,
const Element& value )
......
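Since every method of the `Devices::Sequential` specialization is `__cuda_callable__`, the same calls compile both in host code and inside CUDA kernels. A hypothetical usage sketch (the header path is assumed from the file layout of this commit):

```cpp
#include <TNL/Algorithms/MemoryOperations.h>

void memoryOperationsExample()
{
   using namespace TNL::Algorithms;
   double a[ 4 ], b[ 4 ];
   MemoryOperations< TNL::Devices::Sequential >::set( a, 3.14, 4 );
   MemoryOperations< TNL::Devices::Sequential >::copy( b, a, 4 );
   // compare returns true here, since b is an exact copy of a
   const bool equal = MemoryOperations< TNL::Devices::Sequential >::compare( a, b, 4 );
   (void) equal;
}
```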
@@ -14,6 +14,7 @@
#include <functional> // reduction functions like std::plus, std::logical_and, std::logical_or etc.
+#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
@@ -23,6 +24,35 @@ namespace Algorithms {
template< typename Device >
struct Multireduction;
template<>
struct Multireduction< Devices::Sequential >
{
/**
* Parameters:
* zero: starting value for reduction
* dataFetcher: callable object such that `dataFetcher( i, j )` yields
* the i-th value to be reduced from the j-th dataset
* (i = 0,...,size-1; j = 0,...,n-1)
* reduction: callable object representing the reduction operation
* for example, it can be an instance of std::plus, std::logical_and,
* std::logical_or etc.
* size: the size of each dataset
* n: number of datasets to be reduced
* result: output array of size n
*/
template< typename Result,
typename DataFetcher,
typename Reduction,
typename Index >
static constexpr void
reduce( const Result zero,
DataFetcher dataFetcher,
const Reduction reduction,
const Index size,
const int n,
Result* result );
};
template<>
struct Multireduction< Devices::Host >
{
......
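A hypothetical usage sketch of the new sequential multireduction, following the parameter list documented above: two datasets (n = 2) of five values each are summed in a single call.

```cpp
#include <functional>

void multireductionExample()
{
   const double data[ 2 ][ 5 ] = { { 1, 2, 3, 4, 5 }, { 10, 20, 30, 40, 50 } };
   // dataFetcher( i, j ) returns the i-th value of the j-th dataset
   auto fetcher = [&]( int i, int j ) { return data[ j ][ i ]; };
   double sums[ 2 ];
   TNL::Algorithms::Multireduction< TNL::Devices::Sequential >::reduce(
      0.0, fetcher, std::plus<>{}, 5, 2, sums );
   // sums[ 0 ] == 15.0, sums[ 1 ] == 150.0
}
```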
@@ -29,6 +29,83 @@
namespace TNL {
namespace Algorithms {
template< typename Result,
typename DataFetcher,
typename Reduction,
typename Index >
constexpr void
Multireduction< Devices::Sequential >::
reduce( const Result zero,
DataFetcher dataFetcher,
const Reduction reduction,
const Index size,
const int n,
Result* result )
{
TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." );
TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." );
constexpr int block_size = 128;
const int blocks = size / block_size;
if( blocks > 1 ) {
// initialize array for unrolled results
// (it is accessed as a row-major matrix with n rows and 4 columns)
Result r[ n * 4 ];
for( int k = 0; k < n * 4; k++ )
r[ k ] = zero;
// main reduction (explicitly unrolled loop)
for( int b = 0; b < blocks; b++ ) {
const Index offset = b * block_size;
for( int k = 0; k < n; k++ ) {
Result* _r = r + 4 * k;
for( int i = 0; i < block_size; i += 4 ) {
_r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) );
_r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
_r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
_r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
}
}
}
// reduction of the last, incomplete block (not unrolled)
for( int k = 0; k < n; k++ ) {
Result* _r = r + 4 * k;
for( Index i = blocks * block_size; i < size; i++ )
_r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
}
// reduction of unrolled results
for( int k = 0; k < n; k++ ) {
Result* _r = r + 4 * k;
_r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
_r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
_r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );
// copy the result into the output parameter
result[ k ] = _r[ 0 ];
}
}
else {
for( int k = 0; k < n; k++ )
result[ k ] = zero;
for( int b = 0; b < blocks; b++ ) {
const Index offset = b * block_size;
for( int k = 0; k < n; k++ ) {
for( int i = 0; i < block_size; i++ )
result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) );
}
}
for( int k = 0; k < n; k++ ) {
for( Index i = blocks * block_size; i < size; i++ )
result[ k ] = reduction( result[ k ], dataFetcher( i, k ) );
}
}
}
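The explicit unrolling above keeps four independent partial results per dataset, so consecutive `reduction` calls do not form one long serial dependency chain and can be pipelined or vectorized. The same idea on a single plain array, for illustration (simplified sketch, not part of the commit):

```cpp
// Four independent accumulators shorten the dependency chain; the remainder
// loop mirrors the "last, incomplete block" handling above.
double sum4( const double* data, int size )
{
   double r[ 4 ] = { 0.0, 0.0, 0.0, 0.0 };
   int i = 0;
   for( ; i + 4 <= size; i += 4 ) {
      r[ 0 ] += data[ i ];
      r[ 1 ] += data[ i + 1 ];
      r[ 2 ] += data[ i + 2 ];
      r[ 3 ] += data[ i + 3 ];
   }
   for( ; i < size; i++ )
      r[ 0 ] += data[ i ];
   return ( r[ 0 ] + r[ 1 ] ) + ( r[ 2 ] + r[ 3 ] );
}
```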
template< typename Result,
typename DataFetcher,
typename Reduction,
@@ -45,10 +122,10 @@ reduce( const Result zero,
TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." );
TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." );
-#ifdef HAVE_OPENMP
constexpr int block_size = 128;
const int blocks = size / block_size;
+#ifdef HAVE_OPENMP
if( Devices::Host::isOMPEnabled() && blocks >= 2 ) {
const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() );
#pragma omp parallel num_threads(threads)
@@ -106,67 +183,9 @@ reduce( const Result zero,
}
}
}
-else {
-#endif
-if( blocks > 1 ) {
-// initialize array for unrolled results
-// (it is accessed as a row-major matrix with n rows and 4 columns)
-Result r[ n * 4 ];
-for( int k = 0; k < n * 4; k++ )
-r[ k ] = zero;
-// main reduction (explicitly unrolled loop)
-for( int b = 0; b < blocks; b++ ) {
-const Index offset = b * block_size;
-for( int k = 0; k < n; k++ ) {
-Result* _r = r + 4 * k;
-for( int i = 0; i < block_size; i += 4 ) {
-_r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) );
-_r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) );
-_r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) );
-_r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) );
-}
-}
-}
-// reduction of the last, incomplete block (not unrolled)
-for( int k = 0; k < n; k++ ) {
-Result* _r = r + 4 * k;
-for( Index i = blocks * block_size; i < size; i++ )
-_r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) );
-}
-// reduction of unrolled results
-for( int k = 0; k < n; k++ ) {
-Result* _r = r + 4 * k;
-_r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] );
-_r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] );
-_r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] );
-// copy the result into the output parameter
-result[ k ] = _r[ 0 ];
-}
-}
-else {
-for( int k = 0; k < n; k++ )
-result[ k ] = zero;
-for( int b = 0; b < blocks; b++ ) {
-const Index offset = b * block_size;
-for( int k = 0; k < n; k++ ) {
-for( int i = 0; i < block_size; i++ )
-result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) );
-}
-}
-for( int k = 0; k < n; k++ ) {
-for( Index i = blocks * block_size; i < size; i++ )
-result[ k ] = reduction( result[ k ], dataFetcher( i, k ) );
-}
-}
-#ifdef HAVE_OPENMP
-}
+else
+#endif
+Multireduction< Devices::Sequential >::reduce( zero, dataFetcher, reduction, size, n, result );
}
template< typename Result,
@@ -204,7 +223,7 @@ reduce( const Result zero,
// transfer the reduced data from device to host
std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] };
-MultiDeviceMemoryOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize );
+MultiDeviceMemoryOperations< void, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize );
#ifdef CUDA_REDUCTION_PROFILING
timer.stop();
@@ -215,7 +234,7 @@ reduce( const Result zero,
// finish the reduction on the host
auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; };
-Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
+Multireduction< Devices::Sequential >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult );
#ifdef CUDA_REDUCTION_PROFILING
timer.stop();
......
@@ -10,11 +10,13 @@
#pragma once
+#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Cuda/CheckDevice.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/LaunchHelpers.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Math.h>
/****
@@ -33,9 +35,53 @@ namespace Algorithms {
enum ParallelForMode { SynchronousMode, AsynchronousMode };
-template< typename Device = Devices::Host,
+template< typename Device = Devices::Sequential,
ParallelForMode Mode = SynchronousMode >
struct ParallelFor
{
template< typename Index,
typename Function,
typename... FunctionArgs >
static void exec( Index start, Index end, Function f, FunctionArgs... args )
{
for( Index i = start; i < end; i++ )
f( i, args... );
}
};
template< typename Device = Devices::Sequential,
ParallelForMode Mode = SynchronousMode >
struct ParallelFor2D
{
template< typename Index,
typename Function,
typename... FunctionArgs >
static void exec( Index startX, Index startY, Index endX, Index endY, Function f, FunctionArgs... args )
{
for( Index j = startY; j < endY; j++ )
for( Index i = startX; i < endX; i++ )
f( i, j, args... );
}
};
template< typename Device = Devices::Sequential,
ParallelForMode Mode = SynchronousMode >
struct ParallelFor3D
{
template< typename Index,
typename Function,
typename... FunctionArgs >
static void exec( Index startX, Index startY, Index startZ, Index endX, Index endY, Index endZ, Function f, FunctionArgs... args )
{
for( Index k = startZ; k < endZ; k++ )
for( Index j = startY; j < endY; j++ )
for( Index i = startX; i < endX; i++ )
f( i, j, k, args... );
}
};
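A hypothetical usage sketch: the call site is identical for all devices, and with `Devices::Sequential` the `exec` methods above reduce to the plain loops.

```cpp
#include <vector>

void parallelForExample()
{
   std::vector< double > v( 100 );
   // initialize v[ i ] = i * i; runs serially with Devices::Sequential
   TNL::Algorithms::ParallelFor< TNL::Devices::Sequential >::exec(
      0, 100, [&]( int i ) { v[ i ] = double( i ) * i; } );
   // 2D variant: iterate over a 10x10 index set
   TNL::Algorithms::ParallelFor2D< TNL::Devices::Sequential >::exec(
      0, 0, 10, 10, [&]( int i, int j ) { v[ 10 * j + i ] += 1.0; } );
}
```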
template< ParallelForMode Mode >
struct ParallelFor< Devices::Host, Mode >
{
template< typename Index,
typename Function,
@@ -44,26 +90,23 @@ struct ParallelFor
{
#ifdef HAVE_OPENMP
// Benchmarks show that this is significantly faster compared
-// to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )'
-if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 )
+// to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() && end - start > 512 )'
+if( Devices::Host::isOMPEnabled() && end - start > 512 )
{
-#pragma omp parallel for
+#pragma omp parallel for
for( Index i = start; i < end; i++ )
f( i, args... );
}
else
-for( Index i = start; i < end; i++ )
-f( i, args... );
+ParallelFor< Devices::Sequential >::exec( start, end, f, args... );
#else
-for( Index i = start; i < end; i++ )
-f( i, args... );
+ParallelFor< Devices::Sequential >::exec( start, end, f, args... );
#endif
}
};
-template< typename Device = Devices::Host,
-ParallelForMode Mode = SynchronousMode >
-struct ParallelFor2D
+template< ParallelForMode Mode >
+struct ParallelFor2D< Devices::Host, Mode >
{
template< typename Index,
typename Function,
@@ -72,30 +115,24 @@ struct ParallelFor2D
{
#ifdef HAVE_OPENMP
// Benchmarks show that this is significantly faster compared
-// to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
-if( TNL::Devices::Host::isOMPEnabled() )
+// to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() )'
+if( Devices::Host::isOMPEnabled() )
{
-#pragma omp parallel for
-for( Index j = startY; j < endY; j++ )
-for( Index i = startX; i < endX; i++ )
-f( i, j, args... );
-}
-else {
+#pragma omp parallel for
+for( Index j = startY; j < endY; j++ )
+for( Index i = startX; i < endX; i++ )
+f( i, j, args... );
+}
+else
+ParallelFor2D< Devices::Sequential >::exec( startX, startY, endX, endY, f, args... );
#else
-for( Index j = startY; j < endY; j++ )
-for( Index i = startX; i < endX; i++ )
-f( i, j, args... );
+ParallelFor2D< Devices::Sequential >::exec( startX, startY, endX, endY, f, args... );
#endif
}
};
-template< typename Device = Devices::Host,
-ParallelForMode Mode = SynchronousMode >
-struct ParallelFor3D
+template< ParallelForMode Mode >
+struct ParallelFor3D< Devices::Host, Mode >
{
template< typename Index,
typename Function,
@@ -104,27 +141,19 @@ struct ParallelFor3D
{
#ifdef HAVE_OPENMP
// Benchmarks show that this is significantly faster compared
-// to '#pragma omp parallel for if( TNL::Devices::Host::isOMPEnabled() )'
-if( TNL::Devices::Host::isOMPEnabled() )
-{
-#pragma omp parallel for collapse(2)
-for( Index k = startZ; k < endZ; k++ )
-for( Index j = startY; j < endY; j++ )
-for( Index i = startX; i < endX; i++ )
-f( i, j, k, args... );
-}
-else
+// to '#pragma omp parallel for if( Devices::Host::isOMPEnabled() )'
+if( Devices::Host::isOMPEnabled() )
+{
+#pragma omp parallel for collapse(2)
+for( Index k = startZ; k < endZ; k++ )
+for( Index j = startY; j < endY; j++ )
+for( Index i = startX; i < endX; i++ )
+f( i, j, k, args... );
+}
+else
+ParallelFor3D< Devices::Sequential >::exec( startX, startY, startZ, endX, endY, endZ, f, args... );
#else
-for( Index k = startZ; k < endZ; k++ )
-for( Index j = startY; j < endY; j++ )
-for( Index i = startX; i < endX; i++ )
-f( i, j, k, args... );
+ParallelFor3D< Devices::Sequential >::exec( startX, startY, startZ, endX, endY, endZ, f, args... );
#endif
}
};
......
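All three `Devices::Host` specializations share one dispatch idiom: use OpenMP when it is enabled and the range is large enough to pay off, otherwise delegate to the `Devices::Sequential` specialization so the serial loop body lives in exactly one place. The skeleton, reduced to its essentials (sketch, with a hypothetical helper name):

```cpp
template< typename Index, typename Function >
void hostDispatchSketch( Index start, Index end, Function f )
{
#ifdef HAVE_OPENMP
   if( TNL::Devices::Host::isOMPEnabled() && end - start > 512 ) {
      #pragma omp parallel for
      for( Index i = start; i < end; i++ )
         f( i );
      return;
   }
#endif
   // single fallback path, shared by the OpenMP and non-OpenMP builds
   TNL::Algorithms::ParallelFor< TNL::Devices::Sequential >::exec( start, end, f );
}
```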
@@ -15,6 +15,7 @@
#include <utility> // std::pair
#include <functional> // reduction functions like std::plus, std::logical_and, std::logical_or etc.
+#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
@@ -36,6 +37,30 @@ namespace Algorithms {
template< typename Device >
struct Reduction;
template<>
struct Reduction< Devices::Sequential >
{
template< typename Index,
typename Result,
typename ReductionOperation,
typename DataFetcher >
static constexpr Result
reduce( const Index size,
const ReductionOperation& reduction,
DataFetcher& dataFetcher,
const Result& zero );
template< typename Index,
typename Result,
typename ReductionOperation,
typename DataFetcher >
static constexpr std::pair< Index, Result >
reduceWithArgument( const Index size,
const ReductionOperation& reduction,
DataFetcher& dataFetcher,
const Result& zero );
};
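A hypothetical usage sketch of the sequential reduction; since `reduce` is `constexpr`, simple reductions can in principle even be evaluated at compile time.

```cpp
#include <functional>

void reductionExample()
{
   const double data[ 5 ] = { 1, 2, 3, 4, 5 };
   auto fetcher = [&]( int i ) { return data[ i ]; };
   const double sum = TNL::Algorithms::Reduction< TNL::Devices::Sequential >::reduce(
      5, std::plus<>{}, fetcher, 0.0 );
   // sum == 15.0
}
```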
template<>
struct Reduction< Devices::Host >
{
......
@@ -12,6 +12,7 @@
#pragma once
+#include <TNL/Devices/Sequential.h>
#include <TNL/Devices/Host.h>
#include <TNL/Devices/Cuda.h>
@@ -96,11 +97,71 @@ template< typename Device,
struct SegmentedScan;
template< ScanType Type >
struct Scan< Devices::Sequential, Type >
{
/**
* \brief Computes scan (prefix sum) sequentially.
*
* \tparam Vector type of the vector to be scanned.
* \tparam Reduction lambda function defining the reduction operation
*
* \param v input vector; the result of the scan is stored in the same vector
* \param begin the first element of the range to be scanned
* \param end the end of the range to be scanned (exclusive)
* \param reduction lambda function implementing the reduction operation
* \param zero the identity (neutral) element of the reduction operation, i.e. an element
* which does not change the result of the reduction.
*
* The reduction lambda function takes the two values to be reduced:
*
* ```
* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
* ```
*
* \par Example
*
* \include ReductionAndScan/ScanExample.cpp
*
* \par Output
*
* \include ScanExample.out
*/
template< typename Vector,
typename Reduction >
static void
perform( Vector& v,
const typename Vector::IndexType begin,
const typename Vector::IndexType end,
const Reduction& reduction,
const typename Vector::RealType zero );
template< typename Vector,
typename Reduction >
static auto
performFirstPhase( Vector& v,
const typename Vector::IndexType begin,
const typename Vector::IndexType end,
const Reduction& reduction,
const typename Vector::RealType zero );
template< typename Vector,
typename BlockShifts,
typename Reduction >
static void
performSecondPhase( Vector& v,
const BlockShifts& blockShifts,
const typename Vector::IndexType begin,
const typename Vector::IndexType end,
const Reduction& reduction,
const typename Vector::RealType shift );
};
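For intuition, the inclusive variant of `perform` presumably amounts to a single running pass. A semantics sketch under that assumption (hypothetical helper; the real interface splits the work into the two phases declared above so other backends can reuse it):

```cpp
// Inclusive in-place scan over v[ begin, end ) with a generic reduction.
template< typename Vector, typename Reduction >
void inclusiveScanSketch( Vector& v,
                          typename Vector::IndexType begin,
                          typename Vector::IndexType end,
                          const Reduction& reduction,
                          typename Vector::RealType zero )
{
   typename Vector::RealType running = zero;
   for( auto i = begin; i < end; i++ ) {
      running = reduction( running, v[ i ] );
      v[ i ] = running;   // each element becomes the reduction of its prefix
   }
}
```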
template< ScanType Type >
struct Scan< Devices::Host, Type >
{
/**
-* \brief Computes scan (prefix sum) on CPU.
+* \brief Computes scan (prefix sum) using OpenMP.
*
* \tparam Vector type of the vector to be scanned.
* \tparam Reduction lambda function defining the reduction operation
@@ -216,11 +277,55 @@ struct Scan< Devices::Cuda, Type >
const typename Vector::RealType shift );
};
template< ScanType Type >
struct SegmentedScan< Devices::Sequential, Type >
{
/**
* \brief Computes segmented scan (prefix sum) sequentially.
*
* \tparam Vector type of the vector to be scanned.
* \tparam Reduction lambda function defining the reduction operation
* \tparam Flags array type containing zeros and ones that mark where segments begin
*
* \param v input vector; the result of the scan is stored in the same vector
* \param flags an array of zeros and ones marking where segments begin
* \param begin the first element of the range to be scanned
* \param end the end of the range to be scanned (exclusive)
* \param reduction lambda function implementing the reduction operation
* \param zero the identity (neutral) element of the reduction operation, i.e. an element
* which does not change the result of the reduction.
*
* The reduction lambda function takes the two values to be reduced:
*
* ```
* auto reduction = [] __cuda_callable__ ( const Result& a, const Result& b ) { return ... };
* ```
*
* \par Example
*
* \include ReductionAndScan/SegmentedScanExample.cpp
*
* \par Output
*
* \include SegmentedScanExample.out
*/
template< typename Vector,
typename Reduction,
typename Flags >
static void
perform( Vector& v,
Flags& flags,
const typename Vector::IndexType begin,
const typename Vector::IndexType end,
const Reduction& reduction,
const typename Vector::RealType zero );
};
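A semantics sketch of the segmented variant, under the assumption that a set flag restarts the running reduction at a segment boundary (hypothetical helper, inferred from the documentation above):

```cpp
// In-place segmented inclusive scan: flags[ i ] == 1 starts a new segment.
template< typename Vector, typename Flags, typename Reduction >
void segmentedScanSketch( Vector& v, const Flags& flags,
                          typename Vector::IndexType begin,
                          typename Vector::IndexType end,
                          const Reduction& reduction,
                          typename Vector::RealType zero )
{
   typename Vector::RealType running = zero;
   for( auto i = begin; i < end; i++ ) {
      if( flags[ i ] )
         running = zero;   // restart the reduction at the segment boundary
      running = reduction( running, v[ i ] );
      v[ i ] = running;
   }
}
```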
template< ScanType Type >
struct SegmentedScan< Devices::Host, Type >
{
/**
-* \brief Computes segmented scan (prefix sum) on CPU.
+* \brief Computes segmented scan (prefix sum) using OpenMP.
*
* \tparam Vector type of the vector to be scanned.
* \tparam Reduction lambda function defining the reduction operation
......