Loading src/Benchmarks/BLAS/CommonVectorOperations.hpp +17 −34 Original line number Diff line number Diff line Loading @@ -31,8 +31,7 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -49,8 +48,7 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -67,8 +65,7 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -85,8 +82,7 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -103,8 +99,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -121,8 +116,7 @@ getVectorL2Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > Loading @@ -146,8 +140,7 @@ getVectorLpNorm( const Vector& v, const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > Loading @@ -167,8 +160,7 @@ getVectorSum( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -188,8 +180,7 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -209,8 +200,7 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -230,8 +220,7 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -251,8 +240,7 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -272,8 +260,7 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -296,8 +283,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, return diff * diff; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > Loading @@ -324,8 +310,7 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > Loading @@ -345,8 +330,7 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -366,8 +350,7 @@ getScalarProduct( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks Loading src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +3 −6 Original line number Diff line number Diff line Loading @@ -135,8 +135,7 @@ compare( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } template< typename Element, Loading @@ -153,8 +152,7 @@ containsValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, Loading @@ -171,8 +169,7 @@ containsOnlyValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } Loading src/TNL/Containers/Algorithms/Multireduction.h +0 −6 Original line number Diff line number Diff line Loading @@ -32,7 +32,6 @@ struct Multireduction< Devices::Host > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number of datasets to be reduced * result: output array of size = n Loading @@ -40,13 +39,11 @@ struct Multireduction< Devices::Host > template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* result ); Loading @@ -62,7 +59,6 @@ struct Multireduction< Devices::Cuda > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n Loading @@ -70,13 +66,11 @@ struct Multireduction< Devices::Cuda > template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ); Loading src/TNL/Containers/Algorithms/Multireduction.hpp +1 −5 Original line number Diff line number Diff line Loading @@ -33,14 +33,12 @@ namespace Algorithms { template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > void Multireduction< Devices::Host >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* result ) Loading Loading @@ -173,14 +171,12 @@ reduce( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > void Multireduction< Devices::Cuda >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ) Loading Loading @@ -218,7 +214,7 @@ reduce( const Result zero, // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, volatileReduction, reducedSize, n, hostResult ); Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading src/TNL/Containers/Algorithms/Reduction.h +0 −8 Original line number Diff line number Diff line Loading @@ -30,24 +30,20 @@ struct Reduction< Devices::Host > template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; Loading @@ -58,24 +54,20 @@ struct Reduction< Devices::Cuda > template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; Loading Loading
src/Benchmarks/BLAS/CommonVectorOperations.hpp +17 −34 Original line number Diff line number Diff line Loading @@ -31,8 +31,7 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -49,8 +48,7 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -67,8 +65,7 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -85,8 +82,7 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -103,8 +99,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -121,8 +116,7 @@ getVectorL2Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > Loading @@ -146,8 +140,7 @@ getVectorLpNorm( const Vector& v, const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > Loading @@ -167,8 +160,7 @@ getVectorSum( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -188,8 +180,7 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -209,8 +200,7 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -230,8 +220,7 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > Loading @@ -251,8 +240,7 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > Loading @@ -272,8 +260,7 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -296,8 +283,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, return diff * diff; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ) ); } template< typename Device > Loading @@ -324,8 +310,7 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > Loading @@ -345,8 +330,7 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } template< typename Device > Loading @@ -366,8 +350,7 @@ getScalarProduct( const Vector1& v1, const auto* data2 = v2.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks Loading
src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +3 −6 Original line number Diff line number Diff line Loading @@ -135,8 +135,7 @@ compare( const Element1* destination, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } template< typename Element, Loading @@ -153,8 +152,7 @@ containsValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, false ); } template< typename Element, Loading @@ -171,8 +169,7 @@ containsOnlyValue( const Element* data, auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); return Reduction< Devices::Cuda >::reduce( size, reduction, fetch, true ); } Loading
src/TNL/Containers/Algorithms/Multireduction.h +0 −6 Original line number Diff line number Diff line Loading @@ -32,7 +32,6 @@ struct Multireduction< Devices::Host > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number of datasets to be reduced * result: output array of size = n Loading @@ -40,13 +39,11 @@ struct Multireduction< Devices::Host > template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* result ); Loading @@ -62,7 +59,6 @@ struct Multireduction< Devices::Cuda > * the i-th value to be reduced from the j-th dataset * (i = 0,...,size-1; j = 0,...,n-1) * reduction: callable object representing the reduction operation * volatileReduction: callable object representing the reduction operation * size: the size of each dataset * n: number of datasets to be reduced * hostResult: output array of size = n Loading @@ -70,13 +66,11 @@ struct Multireduction< Devices::Cuda > template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > static void reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ); Loading
src/TNL/Containers/Algorithms/Multireduction.hpp +1 −5 Original line number Diff line number Diff line Loading @@ -33,14 +33,12 @@ namespace Algorithms { template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > void Multireduction< Devices::Host >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* result ) Loading Loading @@ -173,14 +171,12 @@ reduce( const Result zero, template< typename Result, typename DataFetcher, typename Reduction, typename VolatileReduction, typename Index > void Multireduction< Devices::Cuda >:: reduce( const Result zero, DataFetcher dataFetcher, const Reduction reduction, const VolatileReduction volatileReduction, const Index size, const int n, Result* hostResult ) Loading Loading @@ -218,7 +214,7 @@ reduce( const Result zero, // finish the reduction on the host auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, volatileReduction, reducedSize, n, hostResult ); Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); Loading
src/TNL/Containers/Algorithms/Reduction.h +0 −8 Original line number Diff line number Diff line Loading @@ -30,24 +30,20 @@ struct Reduction< Devices::Host > template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; Loading @@ -58,24 +54,20 @@ struct Reduction< Devices::Cuda > template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static Result reduce( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); template< typename Index, typename Result, typename ReductionOperation, typename VolatileReductionOperation, typename DataFetcher > static std::pair< Index, Result > reduceWithArgument( const Index size, ReductionOperation& reduction, VolatileReductionOperation& volatileReduction, DataFetcher& dataFetcher, const Result& zero ); }; Loading