From 4b52cc2bcfdf4ae836f6b831fb1ebb3ddbc44e38 Mon Sep 17 00:00:00 2001
From: Tomáš Oberhuber <oberhuber.tomas@gmail.com>
Date: Sun, 14 Apr 2019 22:43:35 +0200
Subject: [PATCH] Rewriting lambdas with references.

Reduction lambdas no longer return the reduced value: they take the
accumulator by reference and update it in place. Each reduction is now
accompanied by a volatile variant, which the CUDA kernel requires for
the warp-synchronous phase where shared memory is accessed through a
volatile pointer.

---
 .../Algorithms/ArrayOperationsCuda.hpp        |  29 +-
 .../Algorithms/CommonVectorOperations.hpp     | 378 ++++++++++++++++++
 .../Algorithms/CudaReductionKernel.h          |  72 ++--
 src/TNL/Containers/Algorithms/Reduction.h     |   8 +-
 .../{Reduction_impl.h => Reduction.hpp}       |  78 ++--
 5 files changed, 468 insertions(+), 97 deletions(-)
 create mode 100644 src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
 rename src/TNL/Containers/Algorithms/{Reduction_impl.h => Reduction.hpp} (74%)

diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
index a12b9c67fb..bede985bc5 100644
--- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
+++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp
@@ -205,12 +205,10 @@ compareMemory( const Element1* destination,
    TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." );
    TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." );
 
-   auto fetch = [=] __cuda_callable__ ( Index i ) { return ( destination[ i ] == source[ i ] ); };
-   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
-   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
-
-   /*Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities;
-   return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source );*/
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
 }
 
 template< typename Element,
@@ -225,9 +223,10 @@ containsValue( const Element* data,
    TNL_ASSERT_GE( size, 0, "" );
    if( size == 0 ) return false;
 
-   auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); };
-   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a || b; };
-   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false );
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false );
 }
 
 template< typename Element,
@@ -243,14 +242,10 @@ containsOnlyValue( const Element* data,
    TNL_ASSERT_GE( size, 0, "" );
    if( size == 0 ) return false;
 
-   auto fetch = [=] __cuda_callable__ ( Index i ) { return ( data[ i ] == value ); };
-   auto reduction = [=] __cuda_callable__ ( const bool a, const bool b ) { return a && b; };
-   return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true );
-
-
-   /*Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue;
-   reductionContainsOnlyValue.setValue( value );
-   return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr );*/
+   auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); };
+   auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; };
+   return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true );
 }
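For callers, the change looks like this. A minimal before/after sketch using a plain sum, where `data` and `size` are assumed to be in scope:

   // Before: the reduction lambda combined two values and returned the result.
   //    auto reduction = [=] __cuda_callable__ ( const double a, const double b ) { return a + b; };
   //    const double sum = Reduction< Devices::Cuda >::reduce( size, reduction, fetch, 0.0 );

   // After: the first argument is the accumulator, updated in place, and a
   // volatile overload is passed for the warp-synchronous reduction steps.
   auto fetch = [=] __cuda_callable__ ( int i ) -> double { return data[ i ]; };
   auto reduction = [=] __cuda_callable__ ( double& a, const double& b ) { a += b; };
   auto volatileReduction = [=] __cuda_callable__ ( volatile double& a, volatile double& b ) { a += b; };
   const double sum = Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, 0.0 );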
diff --git a/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
new file mode 100644
index 0000000000..1dc43b2c72
--- /dev/null
+++ b/src/TNL/Containers/Algorithms/CommonVectorOperations.hpp
@@ -0,0 +1,378 @@
+/***************************************************************************
+                          CommonVectorOperations.hpp  -  description
+                             -------------------
+    begin                : Apr 12, 2019
+    copyright            : (C) 2019 by Tomas Oberhuber
+    email                : tomas.oberhuber@fjfi.cvut.cz
+ ***************************************************************************/
+
+/* See Copyright Notice in tnl/Copyright */
+
+#pragma once
+
+#include <cmath>
+#include <limits>
+#include <type_traits>
+#include <TNL/Containers/Algorithms/CommonVectorOperations.h>
+#include <TNL/Containers/Algorithms/Reduction.h>
+
+namespace TNL {
+namespace Containers {
+namespace Algorithms {
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorMax( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorMin( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorAbsMax( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive."
); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorAbsMin( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL1Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorL2Norm( const Vector& v ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + const auto* data = v.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorLpNorm( const Vector& v, + const Scalar p ) +{ + TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." 
);
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   if( p == 1.0 )
+      return getVectorL1Norm< Vector, ResultType >( v );
+   if( p == 2.0 )
+      return getVectorL2Norm< Vector, ResultType >( v );
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return std::pow( Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p );
+}
+
+template< typename Device >
+   template< typename Vector, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorSum( const Vector& v )
+{
+   TNL_ASSERT_GT( v.getSize(), 0, "Vector size must be positive." );
+
+   static_assert( ! std::is_same< ResultType, bool >::value,
+                  "Summation of boolean vectors is not supported." );
+
+   using RealType = typename Vector::RealType;
+   using IndexType = typename Vector::IndexType;
+
+   const auto* data = v.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; };
+   return Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceMax( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." );
+
+   using RealType = typename Vector1::RealType;
+   using IndexType = typename Vector1::IndexType;
+
+   const auto* data1 = v1.getData();
+   const auto* data2 = v2.getData();
+   auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; };
+   auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); };
+   auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); };
+   return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() );
+}
+
+template< typename Device >
+   template< typename Vector1, typename Vector2, typename ResultType >
+ResultType
+CommonVectorOperations< Device >::
+getVectorDifferenceMin( const Vector1& v1,
+                        const Vector2& v2 )
+{
+   TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." );
+   TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same."
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMax( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, -std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceAbsMin( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL1Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceL2Norm( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { + auto diff = data1[ i ] - data2[ i ]; + return diff * diff; + }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::sqrt( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType, typename Scalar > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceLpNorm( const Vector1& v1, + const Vector2& v2, + const Scalar p ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + TNL_ASSERT_GE( p, 1.0, "Parameter of the L^p norm must be at least 1.0." ); + + if( p == 1.0 ) + return getVectorDifferenceL1Norm< Vector1, Vector2, ResultType >( v1, v2 ); + if( p == 2.0 ) + return getVectorDifferenceL2Norm< Vector1, Vector2, ResultType >( v1, v2 ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return std::pow( Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getVectorDifferenceSum( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." 
); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +template< typename Device > + template< typename Vector1, typename Vector2, typename ResultType > +ResultType +CommonVectorOperations< Device >:: +getScalarProduct( const Vector1& v1, + const Vector2& v2 ) +{ + TNL_ASSERT_GT( v1.getSize(), 0, "Vector size must be positive." ); + TNL_ASSERT_EQ( v1.getSize(), v2.getSize(), "The vector sizes must be the same." ); + + using RealType = typename Vector1::RealType; + using IndexType = typename Vector1::IndexType; + + const auto* data1 = v1.getData(); + const auto* data2 = v2.getData(); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; + auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; + return Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); +} + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 8fea90f8f4..a90e0a3aab 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -43,12 +43,14 @@ template< int blockSize, typename Result, typename DataFetcher, typename Reduction, + typename VolatileReduction, typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, + const VolatileReduction volatileReduction, const Index size, Result* output ) { @@ -74,21 +76,21 @@ CudaReductionKernel( const Result zero, */ while( gid + 4 * gridSize < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -100,19 +102,19 @@ CudaReductionKernel( const Result zero, if( blockSize >= 1024 ) { if( tid < 
512 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); + reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); + reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); + reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); //printf( "2: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -120,7 +122,7 @@ CudaReductionKernel( const Result zero, if( blockSize >= 128 ) { if( tid < 64 ) - sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); + reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); //printf( "3: tid %d data %f \n", tid, sdata[ tid ] ); } @@ -133,34 +135,34 @@ CudaReductionKernel( const Result zero, volatile ResultType* vsdata = sdata; if( blockSize >= 64 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 32 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); //printf( "4: tid %d data %f \n", tid, sdata[ tid ] ); } // TODO: If blocksize == 32, the following does not work // We do not check if tid < 16. Fix it!!! if( blockSize >= 32 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 16 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); //printf( "5: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 16 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 8 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); //printf( "6: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 8 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 4 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); //printf( "7: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 4 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 2 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); //printf( "8: tid %d data %f \n", tid, sdata[ tid ] ); } if( blockSize >= 2 ) { - vsdata[ tid ] = reduction( vsdata[ tid ], vsdata[ tid + 1 ] ); + volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); //printf( "9: tid %d data %f \n", tid, sdata[ tid ] ); } } @@ -178,11 +180,13 @@ CudaReductionKernel( const Result zero, template< typename DataFetcher, typename Reduction, + typename VolatileReduction, typename Index, typename Result > int CudaReductionKernelLauncher( const Index size, const Reduction& reduction, + const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result*& output ) @@ -226,55 +230,55 @@ CudaReductionKernelLauncher( const Index size, { case 512: CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + 
cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); break; case 1: TNL_ASSERT( false, std::cerr << "blockSize should not be 1." 
<< std::endl ); diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index ece9016c59..7ac0d4c028 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -32,10 +32,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -47,10 +49,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -62,10 +66,12 @@ public: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; @@ -74,4 +80,4 @@ public: } // namespace Containers } // namespace TNL -#include "Reduction_impl.h" +#include "Reduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Reduction_impl.h b/src/TNL/Containers/Algorithms/Reduction.hpp similarity index 74% rename from src/TNL/Containers/Algorithms/Reduction_impl.h rename to src/TNL/Containers/Algorithms/Reduction.hpp index 0fee746cf0..22121de25f 100644 --- a/src/TNL/Containers/Algorithms/Reduction_impl.h +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -42,11 +42,13 @@ static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//25 template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -94,10 +96,11 @@ Reduction< Devices::Cuda >:: */ ResultType* deviceAux1( 0 ); IndexType reducedSize = CudaReductionKernelLauncher( size, - reduction, - dataFetcher, - zero, - deviceAux1 ); + reduction, + volatileReduction, + dataFetcher, + zero, + deviceAux1 ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -111,7 +114,6 @@ Reduction< Devices::Cuda >:: */ //ResultType* resultArray[ reducedSize ]; std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - //ResultType* resultArray = new ResultType[ reducedSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -125,28 +127,26 @@ Reduction< Devices::Cuda >:: * Reduce the data on the host system. */ auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); + const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. 
" << std::endl; #endif - - //delete[] resultArray; return result; } else { /*** * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. */ - //LaterReductionOperation laterReductionOperation; auto copyFetch = [=] __cuda_callable__ ( IndexType i ) { return deviceAux1[ i ]; }; while( reducedSize > 1 ) { reducedSize = CudaReductionKernelLauncher( reducedSize, - reduction, - copyFetch, - zero, - deviceAux1 ); + reduction, + volatileReduction, + copyFetch, + zero, + deviceAux1 ); } #ifdef CUDA_REDUCTION_PROFILING @@ -175,11 +175,13 @@ Reduction< Devices::Cuda >:: template< typename Index, typename Result, typename ReductionOperation, + typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, ReductionOperation& reduction, + VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ) { @@ -202,14 +204,10 @@ Reduction< Devices::Host >:: for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); - r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); - /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); - operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); - operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); - operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ + reduction( r[ 0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } @@ -217,19 +215,18 @@ Reduction< Devices::Host >:: #pragma omp single nowait { for( IndexType i = blocks * block_size; i < size; i++ ) - r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); - //operation.dataFetcher( r[ 0 ], i, input1, input2 ); + reduction( r[ 0 ], dataFetcher( i ) ); } // local reduction of unrolled results - r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); - r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); - r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); + reduction( r[ 0 ], r[ 2 ] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - result = reduction( result, r[ 0 ] ); + reduction( result, r[ 0 ] ); } } return result; @@ -244,37 +241,28 @@ Reduction< Devices::Host >:: for( int b = 0; b < blocks; b++ ) { const IndexType offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); - r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); - /*operation.dataFetcher( r[ 0 ], offset + i, input1, input2 ); - operation.dataFetcher( r[ 1 ], offset + i + 1, input1, input2 ); - operation.dataFetcher( r[ 2 ], offset + i + 2, input1, input2 ); - operation.dataFetcher( r[ 3 ], offset + i + 3, input1, input2 );*/ + reduction( r[ 0 ], dataFetcher( offset + i ) ); + reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) for( IndexType i = blocks * block_size; i < size; i++ ) - r[ 0 ] = 
reduction( r[ 0 ], dataFetcher( i ) ); + reduction( r[ 0 ], dataFetcher( i ) ); //operation.dataFetcher( r[ 0 ], i, input1, input2 ); // reduction of unrolled results - r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); - r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); - r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); - - /*operation.reduction( r[ 0 ], r[ 2 ] ); - operation.reduction( r[ 1 ], r[ 3 ] ); - operation.reduction( r[ 0 ], r[ 1 ] );*/ - + reduction( r[ 0 ], r[ 2 ] ); + reduction( r[ 1 ], r[ 3 ] ); + reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { ResultType result = zero; for( IndexType i = 0; i < size; i++ ) - result = reduction( result, dataFetcher( i ) ); + reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP -- GitLab
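The new calling convention is easy to exercise outside the library. The following self-contained sketch mirrors the sequential (non-OpenMP) branch of Reduction< Devices::Host >::reduce shown above; reduceSequential is a hypothetical stand-in for it and is not part of this patch:

   #include <iostream>

   // Sequential reduction with the new interface: the reduction callable
   // mutates the accumulator through a reference instead of returning a value.
   template< typename Index, typename Result, typename ReductionOperation, typename DataFetcher >
   Result reduceSequential( const Index size, ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero )
   {
      Result result = zero;
      for( Index i = 0; i < size; i++ )
         reduction( result, dataFetcher( i ) );
      return result;
   }

   int main()
   {
      const double data[ 5 ] = { 1.0, 2.0, 3.0, 4.0, 5.0 };
      auto fetch = [&] ( int i ) { return data[ i ]; };
      auto reduction = [] ( double& a, const double& b ) { a += b; };
      std::cout << reduceSequential( 5, reduction, fetch, 0.0 ) << std::endl;   // prints 15
   }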