diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.h b/src/Benchmarks/BLAS/CommonVectorOperations.h index 504da8fa6d06602382df07da05570ad08ff7aef0..ed29153688d0fce31621ee7fa95a6dc9b45174bf 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.h +++ b/src/Benchmarks/BLAS/CommonVectorOperations.h @@ -10,8 +10,6 @@ #pragma once -#include - namespace TNL { namespace Benchmarks { diff --git a/src/Benchmarks/BLAS/CommonVectorOperations.hpp b/src/Benchmarks/BLAS/CommonVectorOperations.hpp index 727a0abcf008aec3e8654ff1a8ac88715e9fc280..640fda337b5d8a8a6dcec75f081702eccb45464c 100644 --- a/src/Benchmarks/BLAS/CommonVectorOperations.hpp +++ b/src/Benchmarks/BLAS/CommonVectorOperations.hpp @@ -11,7 +11,6 @@ #pragma once #include -#include #include "CommonVectorOperations.h" namespace TNL { @@ -30,9 +29,8 @@ getVectorMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -48,9 +46,8 @@ getVectorMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile 
ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -66,9 +63,8 @@ getVectorAbsMax( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -84,9 +80,8 @@ getVectorAbsMin( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, 
const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -102,9 +97,7 @@ getVectorL1Norm( const Vector& v ) const auto* data = v.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -119,10 +112,8 @@ getVectorL2Norm( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data[ i ] * data[ i ]; }; + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -144,10 +135,8 @@ getVectorLpNorm( const Vector& v, return getVectorL2Norm< Vector, ResultType >( v ); const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ 
( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data[ i ] ), p ); }; + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -165,10 +154,8 @@ getVectorSum( const Vector& v ) using IndexType = typename Vector::IndexType; const auto* data = v.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) -> ResultType { return data[ i ]; }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -186,10 +173,9 @@ getVectorDifferenceMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< 
ResultType >::lowest() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -207,10 +193,9 @@ getVectorDifferenceMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -228,10 +213,9 @@ getVectorDifferenceAbsMax( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::max( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::max( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, 
std::numeric_limits< ResultType >::lowest() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Device > @@ -249,10 +233,9 @@ getVectorDifferenceAbsMin( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = TNL::min( a, b ); }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = TNL::min( a, b ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Device > @@ -270,10 +253,8 @@ getVectorDifferenceL1Norm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, 
volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( data1[ i ] - data2[ i ] ); }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -295,9 +276,7 @@ getVectorDifferenceL2Norm( const Vector1& v1, auto diff = data1[ i ] - data2[ i ]; return diff * diff; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ) ); + return std::sqrt( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ) ); } template< typename Device > @@ -322,10 +301,8 @@ getVectorDifferenceLpNorm( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ), 1.0 / p ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( data1[ i ] - data2[ i ] ), p ); }; + return std::pow( Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ), 1.0 / p ); } template< typename Device > @@ -343,10 +320,8 @@ getVectorDifferenceSum( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = 
[=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] - data2[ i ]; }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } template< typename Device > @@ -364,10 +339,8 @@ getScalarProduct( const Vector1& v1, const auto* data1 = v1.getData(); const auto* data2 = v2.getData(); - auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), reduction, volatileReduction, fetch, ( ResultType ) 0 ); + auto fetch = [=] __cuda_callable__ ( IndexType i ) { return data1[ i ] * data2[ i ]; }; + return Containers::Algorithms::Reduction< DeviceType >::reduce( v1.getSize(), std::plus<>{}, fetch, ( ResultType ) 0 ); } } // namespace Benchmarks diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 25fa055bf59afd1dcaebb85b94af7ab58479ec5b..80c63020dcfa27b498dbf4fe6da12cfe1501a9d9 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -346,7 +346,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normCudaET = [&]() { resultDevice = lpNorm( deviceView, 3.0 ); }; - benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU legacy", 
l3normHost ); benchmark.time< Devices::Host >( reset1, "CPU ET", l3normHostET ); @@ -369,7 +368,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductCudaET = [&]() { resultDevice = ( deviceView, deviceView2 ); }; - #ifdef HAVE_BLAS auto scalarProductBlas = [&]() { resultHost = blasGdot( size, hostVector.getData(), 1, hostVector2.getData(), 1 ); @@ -395,38 +393,6 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif - //// - // Prefix sum - /* - std::cout << "Benchmarking prefix-sum:" << std::endl; - timer.reset(); - timer.start(); - hostVector.computePrefixSum(); - timer.stop(); - timeHost = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << std::endl; - - timer.reset(); - timer.start(); - deviceVector.computePrefixSum(); - timer.stop(); - timeDevice = timer.getTime(); - bandwidth = 2 * datasetSize / timer.getTime(); - std::cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." 
<< std::endl; - std::cout << " CPU/GPU speedup: " << timeHost / timeDevice << std::endl; - - HostVector auxHostVector; - auxHostVector.setLike( deviceVector ); - auxHostVector = deviceVector; - for( int i = 0; i < size; i++ ) - if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) - { - std::cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << std::endl; - } - */ - - //// // Scalar multiplication auto multiplyHost = [&]() { @@ -435,6 +401,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto multiplyCuda = [&]() { deviceVector *= 0.5; }; +#ifdef HAVE_BLAS + auto multiplyBlas = [&]() { + blasGscal( hostVector.getSize(), (Real) 0.5, hostVector.getData(), 1 ); + }; +#endif #ifdef HAVE_CUDA auto multiplyCublas = [&]() { const Real alpha = 0.5; @@ -445,6 +416,9 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); benchmark.time< Devices::Host >( reset1, "CPU ET", multiplyHost ); +#ifdef HAVE_BLAS + benchmark.time< Devices::Host >( reset1, "CPU BLAS", multiplyBlas ); +#endif #ifdef HAVE_CUDA benchmark.time< Devices::Cuda >( reset1, "GPU ET", multiplyCuda ); benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); @@ -606,6 +580,34 @@ benchmarkVectorOperations( Benchmark & benchmark, benchmark.time< Devices::Cuda >( resetAll, "cuBLAS", addThreeVectorsCublas ); #endif + //// + // Inclusive prefix sum + auto inclusivePrefixSumHost = [&]() { + hostVector.prefixSum(); + }; + auto inclusivePrefixSumCuda = [&]() { + deviceVector.prefixSum(); + }; + benchmark.setOperation( "inclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", inclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", inclusivePrefixSumCuda ); +#endif + + //// + // Exclusive prefix sum + auto exclusivePrefixSumHost = [&]() { + hostVector.template 
prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + auto exclusivePrefixSumCuda = [&]() { + deviceVector.template prefixSum< Containers::Algorithms::PrefixSumType::Exclusive >(); + }; + benchmark.setOperation( "exclusive prefix sum", 2 * datasetSize ); + benchmark.time< Devices::Host >( reset1, "CPU ET", exclusivePrefixSumHost ); +#ifdef HAVE_CUDA + benchmark.time< Devices::Cuda >( reset1, "GPU ET", exclusivePrefixSumCuda ); +#endif + #ifdef HAVE_CUDA cublasDestroy( cublasHandle ); #endif diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 42b19814e64ad4b5c793752fbe7b5345d8860a0a..683a18376276c4b0dbd194329226e9a517a4af12 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -17,7 +17,6 @@ #include "Logging.h" #include -#include #include #include @@ -35,24 +34,24 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; - struct BenchmarkResult { using HeaderElements = Logging::HeaderElements; using RowElements = Logging::RowElements; - double bandwidth = std::numeric_limits::quiet_NaN(); double time = std::numeric_limits::quiet_NaN(); + double stddev = std::numeric_limits::quiet_NaN(); + double bandwidth = std::numeric_limits::quiet_NaN(); double speedup = std::numeric_limits::quiet_NaN(); virtual HeaderElements getTableHeader() const { - return HeaderElements({"bandwidth", "time", "speedup"}); + return HeaderElements({ "time", "stddev", "stddev/time", "bandwidth", "speedup" }); } virtual RowElements getRowElements() const { - return RowElements({ bandwidth, time, speedup }); + return RowElements({ time, stddev, stddev / time, bandwidth, speedup }); } }; @@ -65,18 +64,17 @@ public: using Logging::MetadataMap; using Logging::MetadataColumns; using SolverMonitorType = Solvers::IterativeSolverMonitor< double, int >; - + Benchmark( int loops = 10, bool verbose = true ) : Logging(verbose), loops(loops) {} - + static void configSetup( Config::ConfigDescription& config ) { config.addEntry< 
int >( "loops", "Number of iterations for every computation.", 10 ); config.addEntry< bool >( "reset", "Call reset function between loops.", true ); config.addEntry< double >( "min-time", "Minimal real time in seconds for every computation.", 0.0 ); - config.addEntry< bool >( "timing", "Turns off (or on) the timing (for the purpose of profiling).", true ); config.addEntry< int >( "verbose", "Verbose mode, the higher number the more verbosity.", 1 ); } @@ -85,7 +83,6 @@ public: this->loops = parameters.getParameter< int >( "loops" ); this->reset = parameters.getParameter< bool >( "reset" ); this->minTime = parameters.getParameter< double >( "min-time" ); - this->timing = parameters.getParameter< bool >( "timing" ); const int verbose = parameters.getParameter< int >( "verbose" ); Logging::setVerbose( verbose ); } @@ -96,7 +93,7 @@ public: { this->loops = loops; } - + void setMinTime( const double& minTime ) { this->minTime = minTime; @@ -121,7 +118,6 @@ public: metadata["loops"] = convertToString(loops); metadata["reset"] = convertToString( reset ); metadata["minimal test time"] = convertToString( minTime ); - metadata["timing"] = convertToString( timing ); writeMetadata( metadata ); } @@ -203,33 +199,22 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. 
template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - if( this->reset ) - result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); + if( this->reset ) + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, reset, loops, minTime, verbose, monitor ); else - if( this->reset ) - result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } @@ -248,7 +233,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ResetFunction, typename ComputeFunction, typename... NextComputations > @@ -272,21 +257,16 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits::quiet_NaN(); + result.stddev = std::numeric_limits::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - if( this->timing ) - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. 
template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } else { - if( this->timing ) - result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); - else - result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); + std::tie( result.time, result.stddev ) = functionTimer.timeFunction( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { @@ -304,7 +284,7 @@ public: return this->baseTime; } - template< typename Device, + template< typename Device, typename ComputeFunction, typename... NextComputations > inline double @@ -345,7 +325,6 @@ protected: double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; - bool timing = true; bool reset = true; SolverMonitorType monitor; }; diff --git a/src/Benchmarks/FunctionTimer.h b/src/Benchmarks/FunctionTimer.h index a76ebd5589d38aac5d53687f6c7e7f48e732f8ff..1edd6120476f5f50bf3eb714ca3ea1bd8a8ca4aa 100644 --- a/src/Benchmarks/FunctionTimer.h +++ b/src/Benchmarks/FunctionTimer.h @@ -17,6 +17,7 @@ #include #include +#include #include namespace TNL { @@ -25,110 +26,94 @@ namespace Benchmarks { template< typename Device > class FunctionTimer { - public: - using DeviceType = Device; - - template< bool timing, - typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - ResetFunction reset, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor(), - bool performReset = true ) +public: + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = 
TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + ResetFunction reset, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + if( verbose > 1 ) + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + Containers::Vector< double > results( maxLoops ); + results.setValue( 0.0 ); + + for( loops = 0; + loops < maxLoops || sum( results ) < minTime; + loops++ ) { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - if( verbose > 1 ) - monitor.setTimer( timer ); - - // warm up + // abuse the monitor's "time" for loops + monitor.setTime( loops + 1 ); reset(); - compute(); - // If we do not perform reset function and don't need - // the monitor, the timer is not interrupted after each loop. - if( ! performReset && verbose < 2 ) - { - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - if( timing ) - timer.start(); - - for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) - compute(); - // Explicit synchronization of the CUDA device -#ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); -#endif - if( timing ) - timer.stop(); - } - else - { - for( loops = 0; - loops < maxLoops || ( timing && timer.getRealTime() < minTime ); - ++loops) - { - // abuse the monitor's "time" for loops - monitor.setTime( loops + 1 ); - reset(); - - // Explicit synchronization of the CUDA device + // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif 
- if( timing ) - timer.start(); - compute(); + + // reset timer before each computation + timer.reset(); + timer.start(); + compute(); #ifdef HAVE_CUDA - if( std::is_same< Device, Devices::Cuda >::value ) - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - if( timing ) - timer.stop(); - } - } - if( timing ) - return timer.getRealTime() / ( double ) loops; - else - return std::numeric_limits::quiet_NaN(); - } + timer.stop(); - template< bool timing, - typename ComputeFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > - double - timeFunction( ComputeFunction compute, - int maxLoops, - const double& minTime, - int verbose = 1, - Monitor && monitor = Monitor() ) - { - auto noReset = [] () {}; - return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); + results[ loops ] = timer.getRealTime(); } - int getPerformedLoops() const - { - return this->loops; + const double mean = sum( results ) / (double) loops; + if( loops > 1 ) { + const double stddev = 1.0 / std::sqrt( loops - 1 ) * l2Norm( results - mean ); + return std::make_pair( mean, stddev ); } - - protected: - int loops; + else { + const double stddev = std::numeric_limits::quiet_NaN(); + return std::make_pair( mean, stddev ); + } + } + + // returns a pair of (mean, stddev) where mean is the arithmetic mean of the + // computation times and stddev is the sample standard deviation + template< typename ComputeFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + std::pair< double, double > + timeFunction( ComputeFunction compute, + int maxLoops, + const double& minTime, + int verbose = 1, + Monitor && monitor = Monitor() ) + { + auto noReset = [] () {}; + return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor ); + } + + int getPerformedLoops() const + { + return this->loops; + } + +protected: + int loops; }; } // namespace Benchmarks 
diff --git a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h index 33dff1ded6f5968f812236d2dd65b23a2f1c2fc9..944e7f73fe4d9cdf0eb12496b3dc1ee02ccdd5f7 100644 --- a/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h +++ b/src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation.h @@ -259,12 +259,21 @@ bool solveHeatEquationCuda( const Config::ParameterContainer& parameters, const Index dofsCount = gridXSize * gridYSize; dim3 cudaUpdateBlocks( dofsCount / 256 + ( dofsCount % 256 != 0 ) ); dim3 cudaUpdateBlockSize( 256 ); - + /**** * Initiation */ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out some warnings... + #ifdef __NVCC__ + Real* u = new Real[ static_cast(dofsCount) ]; + Real* aux = new Real[ static_cast(dofsCount) ]; + #else Real* u = new Real[ dofsCount ]; Real* aux = new Real[ dofsCount ]; + #endif Real* max_du = new Real[ cudaUpdateBlocks.x ]; if( ! u || ! 
aux ) { diff --git a/src/Benchmarks/Logging.h b/src/Benchmarks/Logging.h index b10ab7199337b68fe5c51181b4924fcc8a3f89b6..61608d364e769fe4b6e68a0691957745ee805496 100644 --- a/src/Benchmarks/Logging.h +++ b/src/Benchmarks/Logging.h @@ -16,225 +16,224 @@ #include #include #include +#include #include #include +#include + namespace TNL { - namespace Benchmarks { +namespace Benchmarks { class Logging { - public: - using MetadataElement = std::pair< const char*, String >; - using MetadataMap = std::map< const char*, String >; - using MetadataColumns = std::vector; - - using HeaderElements = std::vector< String >; - using RowElements = std::vector< double >; - - Logging( int verbose = true ) - : verbose(verbose) - {} - - void - setVerbose( int verbose) - { - this->verbose = verbose; - } - - void - writeTitle( const String & title ) - { - if( verbose ) - std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; - log << ": title = " << title << std::endl; - } - - void - writeMetadata( const MetadataMap & metadata ) - { +public: + using MetadataElement = std::pair< const char*, String >; + using MetadataMap = std::map< const char*, String >; + using MetadataColumns = std::vector; + + using HeaderElements = std::vector< String >; + using RowElements = std::vector< double >; + + Logging( int verbose = true ) + : verbose(verbose) + {} + + void + setVerbose( int verbose) + { + this->verbose = verbose; + } + + void + writeTitle( const String & title ) + { + if( verbose ) + std::cout << std::endl << "== " << title << " ==" << std::endl << std::endl; + log << ": title = " << title << std::endl; + } + + void + writeMetadata( const MetadataMap & metadata ) + { + if( verbose ) + std::cout << "properties:" << std::endl; + + for( auto & it : metadata ) { if( verbose ) - std::cout << "properties:" << std::endl; - - for( auto & it : metadata ) { - if( verbose ) - std::cout << " " << it.first << " = " << it.second << std::endl; - log << ": " << it.first << " = " << 
it.second << std::endl; - } - if( verbose ) - std::cout << std::endl; + std::cout << " " << it.first << " = " << it.second << std::endl; + log << ": " << it.first << " = " << it.second << std::endl; } - - void - writeTableHeader( const String & spanningElement, - const HeaderElements & subElements ) - { - if( verbose && header_changed ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.first; - } - - // spanning element is printed as usual column to stdout, - // but is excluded from header - std::cout << std::setw( 15 ) << ""; - - for( auto & it : subElements ) { - std::cout << std::setw( 15 ) << it; - } - std::cout << std::endl; - - header_changed = false; - } - - // initial indent string - header_indent = "!"; - log << std::endl; + if( verbose ) + std::cout << std::endl; + } + + void + writeTableHeader( const String & spanningElement, + const HeaderElements & subElements ) + { + if( verbose && header_changed ) { for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; + std::cout << std::setw( 20 ) << it.first; } - // dump stacked spanning columns - if( horizontalGroups.size() > 0 ) - while( horizontalGroups.back().second <= 0 ) { - horizontalGroups.pop_back(); - header_indent.pop_back(); - } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << std::endl; - header_indent += "!"; - } - } + // spanning element is printed as usual column to stdout, + // but is excluded from header + std::cout << std::setw( 15 ) << ""; - log << header_indent << " " << spanningElement << std::endl; for( auto & it : subElements ) { - log << header_indent << "! 
" << it << std::endl; + std::cout << std::setw( 15 ) << it; } + std::cout << std::endl; - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second--; - header_indent.pop_back(); - } + header_changed = false; } - void - writeTableRow( const String & spanningElement, - const RowElements & subElements ) - { - if( verbose ) { - for( auto & it : metadataColumns ) { - std::cout << std::setw( 20 ) << it.second; - } - // spanning element is printed as usual column to stdout - std::cout << std::setw( 15 ) << spanningElement; - for( auto & it : subElements ) { - std::cout << std::setw( 15 ); - if( it != 0.0 )std::cout << it; - else std::cout << "N/A"; - } - std::cout << std::endl; - } - - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - - // benchmark data are indented - const String indent = " "; - for( auto & it : subElements ) { - if( it != 0.0 ) log << indent << it << std::endl; - else log << indent << "N/A" << std::endl; - } + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; } - void - writeErrorMessage( const char* msg, - int colspan = 1 ) - { - // initial indent string - header_indent = "!"; - log << std::endl; - for( auto & it : metadataColumns ) { - log << header_indent << " " << it.first << std::endl; - } - - // make sure there is a header column for the message - if( horizontalGroups.size() == 0 ) - horizontalGroups.push_back( {"", 1} ); - - // dump stacked spanning columns + // dump stacked spanning columns + if( horizontalGroups.size() > 0 ) while( horizontalGroups.back().second <= 0 ) { horizontalGroups.pop_back(); header_indent.pop_back(); } - for( size_t i = 0; i < horizontalGroups.size(); i++ ) { - if( horizontalGroups[ i ].second > 0 ) { - log << header_indent << " " << horizontalGroups[ i ].first << 
std::endl; - header_indent += "!"; - } - } - if( horizontalGroups.size() > 0 ) { - horizontalGroups.back().second -= colspan; - header_indent.pop_back(); + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; } + } - // only when changed (the header has been already adjusted) - // print each element on separate line - for( auto & it : metadataColumns ) { - log << it.second << std::endl; - } - log << msg << std::endl; + log << header_indent << " " << spanningElement << std::endl; + for( auto & it : subElements ) { + log << header_indent << "! " << it << std::endl; } - void - closeTable() - { - log << std::endl; - header_indent = body_indent = ""; - header_changed = true; - horizontalGroups.clear(); + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second--; + header_indent.pop_back(); } + } - bool save( std::ostream & logFile ) - { - closeTable(); - logFile << log.str(); - if( logFile.good() ) { - log.str() = ""; - return true; + void + writeTableRow( const String & spanningElement, + const RowElements & subElements ) + { + if( verbose ) { + for( auto & it : metadataColumns ) { + std::cout << std::setw( 20 ) << it.second; + } + // spanning element is printed as usual column to stdout + std::cout << std::setw( 15 ) << spanningElement; + for( auto & it : subElements ) { + std::cout << std::setw( 15 ); + if( it != 0.0 )std::cout << it; + else std::cout << "N/A"; } - return false; + std::cout << std::endl; } - protected: - - // manual double -> String conversion with fixed precision - static String - _to_string( double num, int precision = 0, bool fixed = false ) - { - std::stringstream str; - if( fixed ) - str << std::fixed; - if( precision ) - str << std::setprecision( precision ); - str << num; - return String( str.str().data() ); + // only when changed (the header has been already adjusted) + // print each 
element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; } - std::stringstream log; - std::string header_indent; - std::string body_indent; - - int verbose; - MetadataColumns metadataColumns; - bool header_changed = true; - std::vector< std::pair< String, int > > horizontalGroups; -}; + // benchmark data are indented + const String indent = " "; + for( auto & it : subElements ) { + if( it != 0.0 ) log << indent << it << std::endl; + else log << indent << "N/A" << std::endl; + } + } + + void + writeErrorMessage( const char* msg, + int colspan = 1 ) + { + // initial indent string + header_indent = "!"; + log << std::endl; + for( auto & it : metadataColumns ) { + log << header_indent << " " << it.first << std::endl; + } + // make sure there is a header column for the message + if( horizontalGroups.size() == 0 ) + horizontalGroups.push_back( {"", 1} ); - } // namespace Benchmarks -} // namespace TNL + // dump stacked spanning columns + while( horizontalGroups.back().second <= 0 ) { + horizontalGroups.pop_back(); + header_indent.pop_back(); + } + for( size_t i = 0; i < horizontalGroups.size(); i++ ) { + if( horizontalGroups[ i ].second > 0 ) { + log << header_indent << " " << horizontalGroups[ i ].first << std::endl; + header_indent += "!"; + } + } + if( horizontalGroups.size() > 0 ) { + horizontalGroups.back().second -= colspan; + header_indent.pop_back(); + } + // only when changed (the header has been already adjusted) + // print each element on separate line + for( auto & it : metadataColumns ) { + log << it.second << std::endl; + } + log << msg << std::endl; + } + + void + closeTable() + { + log << std::endl; + header_indent = body_indent = ""; + header_changed = true; + horizontalGroups.clear(); + } + + bool save( std::ostream & logFile ) + { + closeTable(); + logFile << log.str(); + if( logFile.good() ) { + log.str() = ""; + return true; + } + return false; + } + +protected: + // manual double -> String conversion with 
fixed precision + static String + _to_string( double num, int precision = 0, bool fixed = false ) + { + std::stringstream str; + if( fixed ) + str << std::fixed; + if( precision ) + str << std::setprecision( precision ); + str << num; + return String( str.str().data() ); + } + + std::stringstream log; + std::string header_indent; + std::string body_indent; + + int verbose; + MetadataColumns metadataColumns; + bool header_changed = true; + std::vector< std::pair< String, int > > horizontalGroups; +}; +} // namespace Benchmarks +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp index 8623c99de0aa29946a1f5bd2c6282fd713b0a938..46618fcd7228c1f055a82fd92d36fc50ee852348 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsCuda.hpp @@ -19,7 +19,6 @@ #include #include #include -#include namespace TNL { namespace Containers { @@ -133,10 +132,8 @@ compare( const Element1* destination, TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( destination[ i ] == source[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return destination[ i ] == source[ i ]; }; + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } template< typename Element, @@ -151,10 +148,8 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." 
); TNL_ASSERT_GE( size, (Index) 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a |= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a |= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, false ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + return Reduction< Devices::Cuda >::reduce( size, std::logical_or<>{}, fetch, false ); } template< typename Element, @@ -169,10 +164,8 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); - auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return ( data[ i ] == value ); }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Reduction< Devices::Cuda >::reduce( size, reduction, volatileReduction, fetch, true ); + auto fetch = [=] __cuda_callable__ ( Index i ) -> bool { return data[ i ] == value; }; + return Reduction< Devices::Cuda >::reduce( size, std::logical_and<>{}, fetch, true ); } diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp index f7c164d066bf6b8d7ece3bfe32cf0759de646d6d..3a54a862f0e8d7c479a3c2fe1a42d536550ac49c 100644 --- a/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsHost.hpp @@ -17,7 +17,6 @@ #include #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp index 5bf04dbaf5374cf564932f93b64aabdc5dba4a6a..4113bbcd90f0edce53d143cf65996a392c2a91b4 100644 --- 
a/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp +++ b/src/TNL/Containers/Algorithms/ArrayOperationsMIC.hpp @@ -18,7 +18,6 @@ #include #include #include -#include #include namespace TNL { diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index e8fc7f8bb913dbc0e280f5d3c6b60d77498cd719..e67c11b419dca3ae83706d43c56f5f282aba4beb 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -12,14 +12,11 @@ #pragma once -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include namespace TNL { namespace Containers { @@ -32,157 +29,128 @@ namespace Algorithms { * architecture so that there are no local memory spills. */ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2 -static constexpr int Multireduction_registersPerThread = 38; // empirically determined optimal value +static constexpr int Multireduction_registersPerThread = 32; // empirically determined optimal value // __CUDA_ARCH__ is defined only in device code! 
#if (__CUDA_ARCH__ >= 300 ) - static constexpr int Multireduction_minBlocksPerMultiprocessor = 6; + static constexpr int Multireduction_minBlocksPerMultiprocessor = 8; #else static constexpr int Multireduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSizeX, typename Operation, typename Index > +template< int blockSizeX, + typename Result, + typename DataFetcher, + typename Reduction, + typename Index > __global__ void __launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor ) -CudaMultireductionKernel( Operation operation, - const int n, +CudaMultireductionKernel( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + const int n, + Result* output ) { - typedef Index IndexType; - typedef typename Operation::ResultType ResultType; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - - /*** - * Get thread id (tid) and global element id (gid). - * gridSizeX is the number of elements in the direction of x-axis - * processed by all blocks at the same time. - */ - const IndexType tid = threadIdx.y * blockDim.x + threadIdx.x; - IndexType gid = blockIdx.x * blockDim.x + threadIdx.x; - const IndexType gridSizeX = blockDim.x * gridDim.x; - - /*** - * Shift input1 and output pointers. - */ - const IndexType y = blockIdx.y * blockDim.y + threadIdx.y; - if( y < n ) { - input1 += y * ldInput1; - output += y * gridDim.x; - } - else - return; - - /*** - * Start with the sequential reduction and push the - * result into the shared memory. 
- */ - sdata[ tid ] = operation.initialValue(); - while( gid + 4 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSizeX, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSizeX, input1, input2 ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + + // Get the thread id (tid), global thread id (gid) and gridSize. + const Index tid = threadIdx.y * blockDim.x + threadIdx.x; + Index gid = blockIdx.x * blockDim.x + threadIdx.x; + const Index gridSizeX = blockDim.x * gridDim.x; + + // Get the dataset index. + const int y = blockIdx.y * blockDim.y + threadIdx.y; + if( y >= n ) return; + + sdata[ tid ] = zero; + + // Start with the sequential reduction and push the result into the shared memory. + while( gid + 4 * gridSizeX < size ) { + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSizeX, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSizeX, y ) ); gid += 4 * gridSizeX; } - while( gid + 2 * gridSizeX < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSizeX, input1, input2 ); + while( gid + 2 * gridSizeX < size ) { + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSizeX, y ) ); gid += 2 * gridSizeX; } - while( gid < size ) - { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); + while( gid < size ) { + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid, y ) ); gid += gridSizeX; } __syncthreads(); - - //printf( "1: tid %d data %f \n", tid, sdata[ tid ] ); - - /*** - * Perform the parallel 
reduction. - */ + // Perform the parallel reduction. if( blockSizeX >= 1024 ) { - if( threadIdx.x < 512 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 512 ] ); - } + if( threadIdx.x < 512 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSizeX >= 512 ) { - if( threadIdx.x < 256 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 256 ] ); - } + if( threadIdx.x < 256 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSizeX >= 256 ) { - if( threadIdx.x < 128 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 128 ] ); - } + if( threadIdx.x < 128 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSizeX >= 128 ) { - if( threadIdx.x < 64 ) { - operation.commonReduction( sdata[ tid ], sdata[ tid + 64 ] ); - } + if( threadIdx.x < 64 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } - /*** - * This runs in one warp so it is synchronized implicitly. - * - * When the blockSizeX is less then or equal to the warp size, the shared memory - * must be at least 2 * blockSizeX elements per block, otherwise unallocated memory - * will be accessed !!! - */ + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( threadIdx.x < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSizeX >= 64 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } - if( blockSizeX >= 32 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSizeX >= 16 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSizeX >= 8 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSizeX >= 4 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSizeX >= 2 ) { - operation.commonReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } + if( blockSizeX >= 64 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); + // Note that here we do not have to check if tid < 16 etc, because we have + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSizeX >= 32 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); + if( blockSizeX >= 16 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); + if( blockSizeX >= 8 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); + if( blockSizeX >= 4 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); + if( blockSizeX >= 2 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } - /*** - * Store the result back in the global memory. - */ + // Store the result back in the global memory. 
if( threadIdx.x == 0 ) { - output[ blockIdx.x ] = sdata[ tid ]; + output[ blockIdx.x + y * gridDim.x ] = sdata[ tid ]; } } +#endif -template< typename Operation, typename Index > +template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > int -CudaMultireductionKernelLauncher( Operation& operation, - const int n, +CudaMultireductionKernelLauncher( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType*& output ) + const int n, + Result*& output ) { - typedef typename Operation::ResultType ResultType; - +#ifdef HAVE_CUDA // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -197,7 +165,7 @@ CudaMultireductionKernelLauncher( Operation& operation, / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread ); const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); dim3 blockSize, gridSize; - + // version A: max 16 rows of threads blockSize.y = TNL::min( n, 16 ); @@ -231,87 +199,86 @@ CudaMultireductionKernelLauncher( Operation& operation, // create reference to the reduction buffer singleton and set size // (make an overestimate to avoid reallocation on every call if n is increased by 1 each time) - const size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( ResultType ); + const std::size_t buf_size = 8 * ( n / 8 + 1 ) * desGridSizeX * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result 
>(); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds const Index shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * blockSize.y * sizeof( ResultType ) - : blockSize.x * blockSize.y * sizeof( ResultType ); - - //cout << "Multireduction of " << n << " datasets, block size (" << blockSize.x << "," << blockSize.y << "), grid size (" << gridSize.x << "," << gridSize.y << "), shmem " << shmem < - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 256: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 128: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 64: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( 
zero, dataFetcher, reduction, size, n, output ); break; case 32: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 16: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 8: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 4: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Operation, Index >, cudaFuncCachePreferShared); + cudaFuncSetCacheConfig(CudaMultireductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( operation, n,size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 2: - cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Operation, Index >, cudaFuncCachePreferShared); + 
cudaFuncSetCacheConfig(CudaMultireductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); CudaMultireductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( operation, n, size, input1, ldInput1, input2, output); + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, n, output ); break; case 1: throw std::logic_error( "blockSize should not be 1." ); default: throw std::logic_error( "Block size is " + std::to_string(blockSize.x) + " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); } + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; // return the size of the output array on the CUDA device return gridSize.x; -} +#else + throw Exceptions::CudaSupportMissing(); #endif +} } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h index 40a662b0856f726370537a8f9431e77126d3221f..ae3bb84de5c167bd40359908d60feb99f8947032 100644 --- a/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h +++ b/src/TNL/Containers/Algorithms/CudaPrefixSumKernel.h @@ -15,7 +15,6 @@ #include #include #include -#include #include namespace TNL { @@ -25,24 +24,21 @@ namespace Algorithms { #ifdef HAVE_CUDA template< typename Real, - typename Operation, - typename VolatileOperation, + typename Reduction, typename Index > __global__ void cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, - Operation operation, - VolatileOperation volatileOperation, + Reduction reduction, const Real zero, const Index size, - const Index elementsInBlock, + const int elementsInBlock, const Real* input, Real* output, - Real* auxArray, - const Real gridShift ) + Real* auxArray ) { Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); - volatile Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; - volatile Real* warpSums = &auxData[ blockDim.x ]; + Real* auxData 
= &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; + Real* warpSums = &auxData[ blockDim.x ]; const Index lastElementIdx = size - blockIdx.x * elementsInBlock; const Index lastElementInBlock = TNL::min( lastElementIdx, elementsInBlock ); @@ -50,8 +46,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, /*** * Load data into the shared memory. */ - const Index blockOffset = blockIdx.x * elementsInBlock; - Index idx = threadIdx.x; + const int blockOffset = blockIdx.x * elementsInBlock; + int idx = threadIdx.x; if( prefixSumType == PrefixSumType::Exclusive ) { if( idx == 0 ) @@ -70,8 +66,6 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, idx += blockDim.x; } } - if( blockIdx.x == 0 && threadIdx.x == 0 ) - operation( sharedData[ 0 ], gridShift ); /*** * Perform the sequential prefix-sum. @@ -87,14 +81,15 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; } - Index chunkPointer( 1 ); + int chunkPointer = 1; while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { - operation( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx.x ] = - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; + sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; chunkPointer++; } @@ -103,9 +98,11 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, */ const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize(); const int warpIdx = threadIdx.x / 
Devices::Cuda::getWarpSize(); - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks ) - volatileOperation( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); + __syncwarp(); + } if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx.x ]; @@ -115,29 +112,32 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) + for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) - volatileOperation( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] ); + __syncwarp(); + } __syncthreads(); /**** * Shift the warp prefix-sums. */ if( warpIdx > 0 ) - volatileOperation( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], warpSums[ warpIdx - 1 ] ); + __syncthreads(); /*** * Store the result back in global memory. 
*/ - __syncthreads(); idx = threadIdx.x; while( idx < elementsInBlock && blockOffset + idx < size ) { - const Index chunkIdx = idx / chunkSize; + const int chunkIdx = idx / chunkSize; Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; - operation( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); + sharedData[ Devices::Cuda::getInterleaving( idx ) ] = + reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ]; idx += blockDim.x; } @@ -147,10 +147,8 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, { if( prefixSumType == PrefixSumType::Exclusive ) { - Real aux = zero; - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ] ); - operation( aux, sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); - auxArray[ blockIdx.x ] = aux; + auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], + sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); } else auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; @@ -158,26 +156,26 @@ cudaFirstPhaseBlockPrefixSum( const PrefixSumType prefixSumType, } template< typename Real, - typename Operation, + typename Reduction, typename Index > __global__ void -cudaSecondPhaseBlockPrefixSum( Operation operation, +cudaSecondPhaseBlockPrefixSum( Reduction reduction, const Index size, - const Index elementsInBlock, - Real gridShift, + const int elementsInBlock, + const Index gridIdx, + const Index maxGridSize, const Real* auxArray, - Real* data ) + Real* data, + Real shift ) { - if( blockIdx.x > 0 ) + if( gridIdx > 0 || blockIdx.x > 0 ) + shift = reduction( shift, auxArray[ gridIdx * maxGridSize + blockIdx.x - 1 ] ); + const int readOffset = blockIdx.x * elementsInBlock; + int readIdx = threadIdx.x; + while( readIdx < 
elementsInBlock && readOffset + readIdx < size ) { - const Real shift = auxArray[ blockIdx.x - 1 ]; - const Index readOffset = blockIdx.x * elementsInBlock; - Index readIdx = threadIdx.x; - while( readIdx < elementsInBlock && readOffset + readIdx < size ) - { - operation( data[ readIdx + readOffset ], shift ); - readIdx += blockDim.x; - } + data[ readIdx + readOffset ] = reduction( data[ readIdx + readOffset ], shift ); + readIdx += blockDim.x; } } @@ -186,191 +184,209 @@ template< PrefixSumType prefixSumType, typename Index > struct CudaPrefixSumKernelLauncher { - template< typename Operation, - typename VolatileOperation > + /**** + * \brief Performs both phases of prefix sum. + * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral element for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. + */ + template< typename Reduction > static void - cudaRecursivePrefixSum( PrefixSumType prefixSumType_, - Operation& operation, - VolatileOperation& volatileOperation, - const Real& zero, - const Index size, - const Index blockSize, - const Index elementsInBlock, - Real& gridShift, - const Real* input, - Real* output ) + perform( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) { + const auto blockShifts = performFirstPhase( + size, + deviceInput, + deviceOutput, + reduction, + zero, + blockSize ); + performSecondPhase( + size, + deviceOutput, + blockShifts.getData(), + reduction, + zero, + blockSize ); + } + + /**** + * \brief Performs the first phase of prefix sum. 
+ * + * \param size Number of elements to be scanned. + * \param deviceInput Pointer to input data on GPU. + * \param deviceOutput Pointer to output array on GPU, can be the same as input. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param zero Neutral value for given reduction operation, i.e. value such that + * `reduction(zero, x) == x` for any `x`. + * \param blockSize The CUDA block size to be used for kernel launch. + */ + template< typename Reduction > + static auto + performFirstPhase( const Index size, + const Real* deviceInput, + Real* deviceOutput, + Reduction& reduction, + const Real zero, + const int blockSize = 256 ) + { + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - const Index auxArraySize = numberOfBlocks; - - Array< Real, Devices::Cuda > auxArray1, auxArray2; - auxArray1.setSize( auxArraySize ); - auxArray2.setSize( auxArraySize ); - - /**** - * Setup block and grid size. - */ - dim3 cudaBlockSize( 0 ), cudaGridSize( 0 ); - cudaBlockSize.x = blockSize; - cudaGridSize.x = roundUpDivision( size, elementsInBlock ); - - /**** - * Run the kernel. 
- */ - const std::size_t sharedDataSize = elementsInBlock + - elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; - const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); - cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> - ( prefixSumType_, - operation, - volatileOperation, - zero, - size, - elementsInBlock, - input, - output, - auxArray1.getData(), - gridShift ); - TNL_CHECK_CUDA_DEVICE; + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); + //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; + // allocate array for the block sums + Array< Real, Devices::Cuda > blockSums; + blockSums.setSize( numberOfBlocks ); - //std::cerr << " auxArray1 = " << auxArray1 << std::endl; - /*** - * In auxArray1 there is now a sum of numbers in each block. - * We must compute prefix-sum of auxArray1 and then shift - * each block. - */ - Real gridShift2 = zero; - if( numberOfBlocks > 1 ) - cudaRecursivePrefixSum( PrefixSumType::Inclusive, - operation, - volatileOperation, - zero, - numberOfBlocks, - blockSize, - elementsInBlock, - gridShift2, - auxArray1.getData(), - auxArray2.getData() ); - - //std::cerr << " auxArray2 = " << auxArray2 << std::endl; - cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> - ( operation, - size, - elementsInBlock, - gridShift, - auxArray2.getData(), - output ); - TNL_CHECK_CUDA_DEVICE; + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; + Index currentSize = size - gridOffset; + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; + //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; + + // setup block and grid size + dim3 
cudaBlockSize, cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + const std::size_t sharedDataSize = elementsInBlock + + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; + const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); + cudaFirstPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize, sharedMemory >>> + ( prefixSumType, + reduction, + zero, + currentSize, + elementsInBlock, + &deviceInput[ gridOffset ], + &deviceOutput[ gridOffset ], + &blockSums[ gridIdx * maxGridSize() ] ); + } - cudaMemcpy( &gridShift, - &auxArray2[ auxArraySize - 1 ], - sizeof( Real ), - cudaMemcpyDeviceToHost ); - //std::cerr << "gridShift = " << gridShift << std::endl; + // synchronize the null-stream after all grids + cudaStreamSynchronize(0); TNL_CHECK_CUDA_DEVICE; + + // blockSums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + if( numberOfBlocks > 1 ) { + CudaPrefixSumKernelLauncher< PrefixSumType::Inclusive, Real, Index >::perform( + blockSums.getSize(), + blockSums.getData(), + blockSums.getData(), + reduction, + zero, + blockSize ); + } + + // Store the number of CUDA grids for the purpose of unit testing, i.e. + // to check if we test the algorithm with more than one CUDA grid. + gridsCount() = numberOfGrids; + + // blockSums now contains shift values for each block - to be used in the second phase + return blockSums; } /**** - * \brief Starts prefix sum in CUDA. + * \brief Performs the second phase of prefix sum. 
* - * \tparam Operation operation to be performed on particular elements - addition usually - * \tparam VolatileOperation - volatile version of Operation - * \param size is number of elements to be scanned - * \param blockSize is CUDA block size - * \param deviceInput is pointer to input data on GPU - * \param deviceOutput is pointer to resulting array, can be the same as input - * \param operation is instance of Operation - * \param volatileOperation is instance of VolatileOperation - * \param zero is neutral element for given Operation + * \param size Number of elements to be scanned. + * \param deviceOutput Pointer to output array on GPU. + * \param blockShifts Pointer to a GPU array containing the block shifts. It is the + * result of the first phase. + * \param reduction Symmetric binary function representing the reduction operation + * (usually addition, i.e. an instance of \ref std::plus). + * \param shift A constant shifting all elements of the array (usually `zero`, i.e. + * the neutral value). + * \param blockSize The CUDA block size to be used for kernel launch. 
*/ - template< typename Operation, - typename VolatileOperation > + template< typename Reduction > static void - start( const Index size, - const Index blockSize, - const Real *deviceInput, - Real* deviceOutput, - Operation& operation, - VolatileOperation& volatileOperation, - const Real& zero ) + performSecondPhase( const Index size, + Real* deviceOutput, + const Real* blockShifts, + Reduction& reduction, + const Real shift, + const Index blockSize = 256 ) { - /**** - * Compute the number of grids - */ - const Index elementsInBlock = 8 * blockSize; + // compute the number of grids + const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - //const auto maxGridSize = 3; //Devices::Cuda::getMaxGridSize(); - const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize ); - Real gridShift = zero; - //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; + const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); - /**** - * Loop over all grids. 
- */ - for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) - { - /**** - * Compute current grid size and size of data to be scanned - */ - const Index gridOffset = gridIdx * maxGridSize * elementsInBlock; + // loop over all grids + for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { + // compute current grid size and size of data to be scanned + const Index gridOffset = gridIdx * maxGridSize() * elementsInBlock; Index currentSize = size - gridOffset; - if( currentSize / elementsInBlock > maxGridSize ) - currentSize = maxGridSize * elementsInBlock; - + if( currentSize / elementsInBlock > maxGridSize() ) + currentSize = maxGridSize() * elementsInBlock; //std::cerr << "GridIdx = " << gridIdx << " grid size = " << currentSize << std::endl; - cudaRecursivePrefixSum( prefixSumType, - operation, - volatileOperation, - zero, - currentSize, - blockSize, - elementsInBlock, - gridShift, - &deviceInput[ gridOffset ], - &deviceOutput[ gridOffset ] ); - TNL_CHECK_CUDA_DEVICE; + + // setup block and grid size + dim3 cudaBlockSize, cudaGridSize; + cudaBlockSize.x = blockSize; + cudaGridSize.x = roundUpDivision( currentSize, elementsInBlock ); + + // run the kernel + cudaSecondPhaseBlockPrefixSum<<< cudaGridSize, cudaBlockSize >>> + ( reduction, + size, + elementsInBlock, + gridIdx, + (Index) maxGridSize(), + blockShifts, + &deviceOutput[ gridOffset ], + shift ); } - /*** - * Store the number of CUDA grids for a purpose of unit testing, i.e. - * to check if we test the algorithm with more than one CUDA grid. - */ - gridsCount = numberOfGrids; + // synchronize the null-stream after all grids + cudaStreamSynchronize(0); + TNL_CHECK_CUDA_DEVICE; } /**** * The following serves for setting smaller maxGridSize so that we can force * the prefix sum in CUDA to run with more the one grids in unit tests. 
*/ - static void setMaxGridSize( int newMaxGridSize ) { - maxGridSize = newMaxGridSize; + static int& maxGridSize() + { + static int maxGridSize = Devices::Cuda::getMaxGridSize(); + return maxGridSize; } - static void resetMaxGridSize() { - maxGridSize = Devices::Cuda::getMaxGridSize(); + static void resetMaxGridSize() + { + maxGridSize() = Devices::Cuda::getMaxGridSize(); } - static int maxGridSize; - - static int gridsCount; + static int& gridsCount() + { + static int gridsCount = -1; + return gridsCount; + } }; -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::maxGridSize = Devices::Cuda::getMaxGridSize(); - -template< PrefixSumType prefixSumType, - typename Real, - typename Index > -int CudaPrefixSumKernelLauncher< prefixSumType, Real, Index >::gridsCount = -1; - - #endif } // namespace Algorithms } // namespace Containers } // namespace TNL - - diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 89e13a272dcf2253d8430dc73778257e85737174..ce278c7f1b9d8393afa86d848a9dddb2f52fd744 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -12,20 +12,17 @@ #include // std::pair -#ifdef HAVE_CUDA -#include -#endif - #include #include #include #include +#include +#include namespace TNL { namespace Containers { namespace Algorithms { -#ifdef HAVE_CUDA /**** * The performance of this kernel is very sensitive to register usage. * Compile with --ptxas-options=-v and configure these constants for given @@ -34,6 +31,7 @@ namespace Algorithms { static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 static constexpr int Reduction_registersPerThread = 32; // empirically determined optimal value +#ifdef HAVE_CUDA // __CUDA_ARCH__ is defined only in device code! 
#if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 8; @@ -42,47 +40,42 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi #endif template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; sdata[ tid ] = zero; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. 
while( gid + 4 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); - reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 2 * gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + 3 * gridSize ) ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); - reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid + gridSize ) ); gid += 2 * gridSize; } while( gid < size ) { - reduction( sdata[ tid ], dataFetcher( gid ) ); + sdata[ tid ] = reduction( sdata[ tid ], dataFetcher( gid ) ); gid += gridSize; } __syncthreads(); @@ -90,49 +83,48 @@ CudaReductionKernel( const Result zero, // Perform the parallel reduction. if( blockSize >= 1024 ) { if( tid < 512 ) - reduction( sdata[ tid ], sdata[ tid + 512 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 512 ] ); __syncthreads(); } if( blockSize >= 512 ) { if( tid < 256 ) - reduction( sdata[ tid ], sdata[ tid + 256 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 256 ] ); __syncthreads(); } if( blockSize >= 256 ) { if( tid < 128 ) - reduction( sdata[ tid ], sdata[ tid + 128 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 128 ] ); __syncthreads(); } if( blockSize >= 128 ) { if( tid < 64 ) - reduction( sdata[ tid ], sdata[ tid + 64 ] ); + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 64 ] ); __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - if( blockSize >= 64 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 32 ] ); - } + if( blockSize >= 64 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { - volatileReduction( vsdata[ tid ], vsdata[ tid + 1 ] ); - } + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); + if( blockSize >= 16 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); + if( blockSize >= 8 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); + if( blockSize >= 4 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); + if( blockSize >= 2 ) + sdata[ tid ] = reduction( sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -141,34 +133,29 @@ CudaReductionKernel( const Result zero, } template< int blockSize, - typename Result, - typename DataFetcher, - typename Reduction, - typename VolatileReduction, - typename Index > + typename Result, + typename DataFetcher, + typename Reduction, + typename Index > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) CudaReductionWithArgumentKernel( const Result zero, const DataFetcher dataFetcher, const Reduction reduction, - const VolatileReduction volatileReduction, const Index size, Result* output, Index* idxOutput, const Index* idxInput = nullptr ) { - using IndexType = Index; - using ResultType = Result; - - ResultType* sdata = Devices::Cuda::getSharedMemory< ResultType >(); - IndexType* sidx = reinterpret_cast< IndexType* >( &sdata[ blockDim.x ] ); + Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] ); // Get the thread id (tid), global thread id (gid) and gridSize. - const IndexType tid = threadIdx.x; - IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; - const IndexType gridSize = blockDim.x * gridDim.x; + const Index tid = threadIdx.x; + Index gid = blockIdx.x * blockDim. x + threadIdx.x; + const Index gridSize = blockDim.x * gridDim.x; - // Read data into the shared memory. We start with the sequential reduction. + // Start with the sequential reduction and push the result into the shared memory. if( idxInput ) { if( gid < size ) { sdata[ tid ] = dataFetcher( gid ); @@ -243,31 +230,29 @@ CudaReductionWithArgumentKernel( const Result zero, __syncthreads(); } - // This runs in one warp so it is synchronized implicitly. + // This runs in one warp so we use __syncwarp() instead of __syncthreads(). 
if( tid < 32 ) { - volatile ResultType* vsdata = sdata; - volatile IndexType* vsidx = sidx; - if( blockSize >= 64 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 32 ], vsdata[ tid ], vsdata[ tid + 32 ] ); - } + if( blockSize >= 64 ) + reduction( sidx[ tid ], sidx[ tid + 32 ], sdata[ tid ], sdata[ tid + 32 ] ); + __syncwarp(); // Note that here we do not have to check if tid < 16 etc, because we have - // twice as much shared memory, so we do not access out of bounds. The - // results for the upper half will be undefined, but unused anyway. - if( blockSize >= 32 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 16 ], vsdata[ tid ], vsdata[ tid + 16 ] ); - } - if( blockSize >= 16 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 8 ], vsdata[ tid ], vsdata[ tid + 8 ] ); - } - if( blockSize >= 8 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 4 ], vsdata[ tid ], vsdata[ tid + 4 ] ); - } - if( blockSize >= 4 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 2 ], vsdata[ tid ], vsdata[ tid + 2 ] ); - } - if( blockSize >= 2 ) { - volatileReduction( vsidx[ tid ], vsidx[ tid + 1 ], vsdata[ tid ], vsdata[ tid + 1 ] ); - } + // 2 * blockSize.x elements of shared memory per block, so we do not + // access out of bounds. The results for the upper half will be undefined, + // but unused anyway. + if( blockSize >= 32 ) + reduction( sidx[ tid ], sidx[ tid + 16 ], sdata[ tid ], sdata[ tid + 16 ] ); + __syncwarp(); + if( blockSize >= 16 ) + reduction( sidx[ tid ], sidx[ tid + 8 ], sdata[ tid ], sdata[ tid + 8 ] ); + __syncwarp(); + if( blockSize >= 8 ) + reduction( sidx[ tid ], sidx[ tid + 4 ], sdata[ tid ], sdata[ tid + 4 ] ); + __syncwarp(); + if( blockSize >= 4 ) + reduction( sidx[ tid ], sidx[ tid + 2 ], sdata[ tid ], sdata[ tid + 2 ] ); + __syncwarp(); + if( blockSize >= 2 ) + reduction( sidx[ tid ], sidx[ tid + 1 ], sdata[ tid ], sdata[ tid + 1 ] ); } // Store the result back in the global memory. 
@@ -276,15 +261,13 @@ CudaReductionWithArgumentKernel( const Result zero, idxOutput[ blockIdx.x ] = sidx[ 0 ]; } } +#endif template< typename Index, typename Result > struct CudaReductionKernelLauncher { - using IndexType = Index; - using ResultType = Result; - // The number of blocks should be a multiple of the number of multiprocessors // to ensure optimum balancing of the load. This is very important, because // we run the kernel with a fixed number of blocks, so the amount of work per @@ -309,62 +292,56 @@ struct CudaReductionKernelLauncher } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int start( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output ) + Result*& output ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * sizeof( ResultType ); + const std::size_t buf_size = 2 * desGridSize * sizeof( Result ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); + output = cudaReductionBuffer.template getData< Result >(); - this->reducedSize = this->launch( originalSize, reduction, volatileReduction, dataFetcher, zero, output ); + this->reducedSize = this->launch( originalSize, reduction, dataFetcher, zero, output ); return this->reducedSize; } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int startWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, - ResultType*& output, - IndexType*& idxOutput ) + Result*& output, + Index*& idxOutput ) { // create reference to the reduction buffer singleton and set size - const std::size_t buf_size = 2 * desGridSize * ( sizeof( 
ResultType ) + sizeof( IndexType ) ); + const std::size_t buf_size = 2 * desGridSize * ( sizeof( Result ) + sizeof( Index ) ); CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); cudaReductionBuffer.setSize( buf_size ); - output = cudaReductionBuffer.template getData< ResultType >(); - idxOutput = reinterpret_cast< IndexType* >( &output[ 2 * desGridSize ] ); + output = cudaReductionBuffer.template getData< Result >(); + idxOutput = reinterpret_cast< Index* >( &output[ 2 * desGridSize ] ); - this->reducedSize = this->launchWithArgument( originalSize, reduction, volatileReduction, dataFetcher, zero, output, idxOutput, nullptr ); + this->reducedSize = this->launchWithArgument( originalSize, reduction, dataFetcher, zero, output, idxOutput, nullptr ); return this->reducedSize; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > Result finish( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; - this->reducedSize = this->launch( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output ); + auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; + this->reducedSize = this->launch( this->reducedSize, reduction, copyFetch, zero, output ); std::swap( input, output ); } @@ -373,30 +350,28 @@ struct CudaReductionKernelLauncher std::swap( input, 
output ); // Copy result on CPU - ResultType result; + Result result; ArrayOperations< Devices::Host, Devices::Cuda >::copy( &result, output, 1 ); return result; } - template< typename Reduction, - typename VolatileReduction > + template< typename Reduction > std::pair< Index, Result > finishWithArgument( const Reduction& reduction, - const VolatileReduction& volatileReduction, const Result& zero ) { // Input is the first half of the buffer, output is the second half CudaReductionBuffer& cudaReductionBuffer = CudaReductionBuffer::getInstance(); - ResultType* input = cudaReductionBuffer.template getData< ResultType >(); - ResultType* output = &input[ desGridSize ]; - IndexType* idxInput = reinterpret_cast< IndexType* >( &output[ desGridSize ] ); - IndexType* idxOutput = &idxInput[ desGridSize ]; + Result* input = cudaReductionBuffer.template getData< Result >(); + Result* output = &input[ desGridSize ]; + Index* idxInput = reinterpret_cast< Index* >( &output[ desGridSize ] ); + Index* idxOutput = &idxInput[ desGridSize ]; while( this->reducedSize > 1 ) { // this lambda has to be defined inside the loop, because the captured variable changes - auto copyFetch = [input] __cuda_callable__ ( IndexType i ) { return input[ i ]; }; - this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, volatileReduction, copyFetch, zero, output, idxOutput, idxInput ); + auto copyFetch = [input] __cuda_callable__ ( Index i ) { return input[ i ]; }; + this->reducedSize = this->launchWithArgument( this->reducedSize, reduction, copyFetch, zero, output, idxOutput, idxInput ); std::swap( input, output ); std::swap( idxInput, idxOutput ); } @@ -417,217 +392,220 @@ struct CudaReductionKernelLauncher protected: template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launch( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, 
Result* output ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * sizeof( ResultType ) - : blockSize.x * sizeof( ResultType ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * sizeof( Result ) + : blockSize.x * sizeof( Result ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, 
DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. + switch( blockSize.x ) + { + case 512: + CudaReductionKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, 
size, output); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 4: + cudaFuncSetCacheConfig(CudaReductionKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." 
); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); - CudaReductionKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); - } + CudaReductionKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); + } - // Return the size of the output array on the CUDA device - return gridSize.x; + // Return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } template< typename DataFetcher, - typename Reduction, - typename VolatileReduction > + typename Reduction > int launchWithArgument( const Index size, const Reduction& reduction, - const VolatileReduction& volatileReduction, const DataFetcher& dataFetcher, const Result& zero, Result* output, Index* idxOutput, const Index* idxInput ) { +#ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds - const IndexType shmem = (blockSize.x <= 32) - ? 2 * blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ) - : blockSize.x * ( sizeof( ResultType ) + sizeof( Index ) ); + const Index shmem = (blockSize.x <= 32) + ? 2 * blockSize.x * ( sizeof( Result ) + sizeof( Index ) ) + : blockSize.x * ( sizeof( Result ) + sizeof( Index ) ); - // This is "general", but this method always sets blockSize.x to a specific value, - // so runtime switch is not necessary - it only prolongs the compilation time. + // This is "general", but this method always sets blockSize.x to a specific value, + // so runtime switch is not necessary - it only prolongs the compilation time. /* - // Depending on the blockSize we generate appropriate template instance. 
- switch( blockSize.x ) - { - case 512: - CudaReductionWithArgumentKernel< 512 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 256: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 256 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 128: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 128 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 64: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 64 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 32: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 32 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 16: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 16 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 8: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, 
Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 8 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 4: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 4 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 2: - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); - - CudaReductionWithArgumentKernel< 2 > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - break; - case 1: - TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); - default: - TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); - } - TNL_CHECK_CUDA_DEVICE; + // Depending on the blockSize we generate appropriate template instance. 
+ switch( blockSize.x ) + { + case 512: + CudaReductionWithArgumentKernel< 512 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 256: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 256, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 256 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 128: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 128, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 128 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 64: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 64, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 64 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 32: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 32, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 32 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 16: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 16, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 16 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 8: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 8, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 8 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 4: + 
cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 4, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 4 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 2: + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< 2, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); + + CudaReductionWithArgumentKernel< 2 > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + break; + case 1: + TNL_ASSERT( false, std::cerr << "blockSize should not be 1." << std::endl ); + default: + TNL_ASSERT( false, std::cerr << "Block size is " << blockSize. x << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." ); + } + TNL_CHECK_CUDA_DEVICE; */ - // Check just to future-proof the code setting blockSize.x - if( blockSize.x == Reduction_maxThreadsPerBlock ) { - cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, VolatileReduction, Index >, cudaFuncCachePreferShared); + // Check just to future-proof the code setting blockSize.x + if( blockSize.x == Reduction_maxThreadsPerBlock ) { + cudaFuncSetCacheConfig(CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock, Result, DataFetcher, Reduction, Index >, cudaFuncCachePreferShared); - CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > - <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, volatileReduction, size, output, idxOutput, idxInput ); - } - else { - TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." 
<< std::endl; ); - } + CudaReductionWithArgumentKernel< Reduction_maxThreadsPerBlock > + <<< gridSize, blockSize, shmem >>>( zero, dataFetcher, reduction, size, output, idxOutput, idxInput ); + } + else { + TNL_ASSERT( false, std::cerr << "Block size was expected to be " << Reduction_maxThreadsPerBlock << ", but " << blockSize.x << " was specified." << std::endl; ); + } - // return the size of the output array on the CUDA device - return gridSize.x; + // return the size of the output array on the CUDA device + return gridSize.x; +#else + throw Exceptions::CudaSupportMissing(); +#endif } const int activeDevice; const int blocksdPerMultiprocessor; const int desGridSize; - const IndexType originalSize; - IndexType reducedSize; + const Index originalSize; + Index reducedSize; }; -#endif } // namespace Algorithms } // namespace Containers diff --git a/src/TNL/Containers/Algorithms/DistributedPrefixSum.h b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h new file mode 100644 index 0000000000000000000000000000000000000000..b81e2ac94d7959a417582de613e4a8793040c950 --- /dev/null +++ b/src/TNL/Containers/Algorithms/DistributedPrefixSum.h @@ -0,0 +1,70 @@ +/*************************************************************************** + PrefixSum.h - description + ------------------- + begin : Aug 16, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include +#include + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< PrefixSumType Type > +struct DistributedPrefixSum +{ + template< typename DistributedVector, + typename Reduction > + static void + perform( DistributedVector& v, + typename DistributedVector::IndexType begin, + typename DistributedVector::IndexType end, + const Reduction& reduction, + const typename DistributedVector::RealType zero ) + { + using RealType = typename DistributedVector::RealType; + using DeviceType = typename DistributedVector::DeviceType; + using CommunicatorType = typename DistributedVector::CommunicatorType; + + const auto group = v.getCommunicationGroup(); + if( group != CommunicatorType::NullGroup ) { + // adjust begin and end for the local range + const auto localRange = v.getLocalRange(); + begin = min( max( begin, localRange.getBegin() ), localRange.getEnd() ) - localRange.getBegin(); + end = max( min( end, localRange.getEnd() ), localRange.getBegin() ) - localRange.getBegin(); + + // perform first phase on the local data + auto localView = v.getLocalView(); + const auto blockShifts = PrefixSum< DeviceType, Type >::performFirstPhase( localView, begin, end, reduction, zero ); + const RealType localSum = blockShifts.getElement( blockShifts.getSize() - 1 ); + + // exchange local sums between ranks + const int nproc = CommunicatorType::GetSize( group ); + RealType dataForScatter[ nproc ]; + for( int i = 0; i < nproc; i++ ) dataForScatter[ i ] = localSum; + Vector< RealType, Devices::Host > rankSums( nproc ); + // NOTE: exchanging general data types does not work with MPI + CommunicatorType::Alltoall( dataForScatter, 1, rankSums.getData(), 1, group ); + + // compute prefix-sum of the per-rank sums + PrefixSum< Devices::Host, 
PrefixSumType::Exclusive >::perform( rankSums, 0, nproc, reduction, zero ); + + // perform second phase: shift by the per-block and per-rank offsets + const int rank = CommunicatorType::GetRank( group ); + PrefixSum< DeviceType, Type >::performSecondPhase( localView, blockShifts, begin, end, reduction, rankSums[ rank ] ); + } + } +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction.h b/src/TNL/Containers/Algorithms/Multireduction.h index 42b8bf28d16842f9d28a624f8995f2ffe959d943..9802a295356db6e53c7f9c3d809c3dc9c70b38c8 100644 --- a/src/TNL/Containers/Algorithms/Multireduction.h +++ b/src/TNL/Containers/Algorithms/Multireduction.h @@ -12,66 +12,78 @@ #pragma once +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. + #include #include -#include namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > -class Multireduction -{ -}; +struct Multireduction; template<> -class Multireduction< Devices::Cuda > +struct Multireduction< Devices::Host > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. 
+ * size: the size of each dataset + * n: number of datasets to be reduced + * result: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); -}; - -template<> -class Multireduction< Devices::Host > -{ -public: - template< typename Operation, typename Index > - static void - reduce( Operation& operation, const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + Result* result ); }; template<> -class Multireduction< Devices::MIC > +struct Multireduction< Devices::Cuda > { -public: - template< typename Operation, typename Index > + /** + * Parameters: + * zero: starting value for reduction + * dataFetcher: callable object such that `dataFetcher( i, j )` yields + * the i-th value to be reduced from the j-th dataset + * (i = 0,...,size-1; j = 0,...,n-1) + * reduction: callable object representing the reduction operation + * for example, it can be an instance of std::plus, std::logical_and, + * std::logical_or etc. 
+ * size: the size of each dataset + * n: number of datasets to be reduced + * hostResult: output array of size = n + */ + template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > static void - reduce( Operation& operation, - const int n, + reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ); + const int n, + Result* hostResult ); }; } // namespace Algorithms } // namespace Containers } // namespace TNL -#include "Multireduction_impl.h" +#include "Multireduction.hpp" diff --git a/src/TNL/Containers/Algorithms/Multireduction.hpp b/src/TNL/Containers/Algorithms/Multireduction.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f4407e6207ba4d67c05ba7f434e2629cfd3c592f --- /dev/null +++ b/src/TNL/Containers/Algorithms/Multireduction.hpp @@ -0,0 +1,229 @@ +/*************************************************************************** + Multireduction_impl.h - description + ------------------- + begin : May 13, 2016 + copyright : (C) 2016 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +// Implemented by: Jakub Klinkovsky + +#pragma once + +#include // std::unique_ptr + +//#define CUDA_REDUCTION_PROFILING + +#include +#include +#include +#include + +#ifdef CUDA_REDUCTION_PROFILING +#include +#include +#endif + +namespace TNL { +namespace Containers { +namespace Algorithms { + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > +void +Multireduction< Devices::Host >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const Index size, + const int n, + Result* result ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); + + constexpr int block_size = 128; + const int blocks = size / block_size; + +#ifdef HAVE_OPENMP + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) + { + // first thread initializes the result array + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) + result[ k ] = zero; + } + + // initialize array for thread-local results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + #pragma omp for nowait + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // the first thread 
that reaches here processes the last, incomplete block + #pragma omp single nowait + { + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + } + + // local reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); + } + + // inter-thread reduction of local results + #pragma omp critical + { + for( int k = 0; k < n; k++ ) + result[ k ] = reduction( result[ k ], r[ 4 * k ] ); + } + } + } + else { +#endif + if( blocks > 1 ) { + // initialize array for unrolled results + // (it is accessed as a row-major matrix with n rows and 4 columns) + Result r[ n * 4 ]; + for( int k = 0; k < n * 4; k++ ) + r[ k ] = zero; + + // main reduction (explicitly unrolled loop) + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( int i = 0; i < block_size; i += 4 ) { + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( offset + i, k ) ); + _r[ 1 ] = reduction( _r[ 1 ], dataFetcher( offset + i + 1, k ) ); + _r[ 2 ] = reduction( _r[ 2 ], dataFetcher( offset + i + 2, k ) ); + _r[ 3 ] = reduction( _r[ 3 ], dataFetcher( offset + i + 3, k ) ); + } + } + } + + // reduction of the last, incomplete block (not unrolled) + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + for( Index i = blocks * block_size; i < size; i++ ) + _r[ 0 ] = reduction( _r[ 0 ], dataFetcher( i, k ) ); + } + + // reduction of unrolled results + for( int k = 0; k < n; k++ ) { + Result* _r = r + 4 * k; + _r[ 0 ] = reduction( _r[ 0 ], _r[ 1 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 2 ] ); + _r[ 0 ] = reduction( _r[ 0 ], _r[ 3 ] ); + + // copy the result into the output parameter + result[ k ] = _r[ 0 ]; + } + } + else { + for( int k = 0; k < n; k++ ) + result[ k ] = 
zero; + + for( int b = 0; b < blocks; b++ ) { + const Index offset = b * block_size; + for( int k = 0; k < n; k++ ) { + for( int i = 0; i < block_size; i++ ) + result[ k ] = reduction( result[ k ], dataFetcher( offset + i, k ) ); + } + } + + for( int k = 0; k < n; k++ ) { + for( Index i = blocks * block_size; i < size; i++ ) + result[ k ] = reduction( result[ k ], dataFetcher( i, k ) ); + } + } +#ifdef HAVE_OPENMP + } +#endif +} + +template< typename Result, + typename DataFetcher, + typename Reduction, + typename Index > +void +Multireduction< Devices::Cuda >:: +reduce( const Result zero, + DataFetcher dataFetcher, + const Reduction reduction, + const Index size, + const int n, + Result* hostResult ) +{ + TNL_ASSERT_GT( size, 0, "The size of datasets must be positive." ); + TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); + + #ifdef CUDA_REDUCTION_PROFILING + Timer timer; + timer.reset(); + timer.start(); + #endif + + // start the reduction on the GPU + Result* deviceAux1 = nullptr; + const int reducedSize = CudaMultireductionKernelLauncher( zero, dataFetcher, reduction, size, n, deviceAux1 ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; + timer.reset(); + timer.start(); + #endif + + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ new Result[ n * reducedSize ] }; + ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, n * reducedSize ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. 
" << std::endl; + timer.reset(); + timer.start(); + #endif + + // finish the reduction on the host + auto dataFetcherFinish = [&] ( int i, int k ) { return resultArray[ i + k * reducedSize ]; }; + Multireduction< Devices::Host >::reduce( zero, dataFetcherFinish, reduction, reducedSize, n, hostResult ); + + #ifdef CUDA_REDUCTION_PROFILING + timer.stop(); + std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; + #endif +}; + +} // namespace Algorithms +} // namespace Containers +} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Multireduction_impl.h b/src/TNL/Containers/Algorithms/Multireduction_impl.h deleted file mode 100644 index ebd0ad2562a4795fcea6abeb9287b5063b92c408..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/Multireduction_impl.h +++ /dev/null @@ -1,327 +0,0 @@ -/*************************************************************************** - Multireduction_impl.h - description - ------------------- - begin : May 13, 2016 - copyright : (C) 2016 by Tomas Oberhuber et al. - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -// Implemented by: Jakub Klinkovsky - -#pragma once - -#include "Multireduction.h" - -//#define CUDA_REDUCTION_PROFILING - -#include -#include -#include -#include -#include -#include - -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/**** - * Arrays smaller than the following constant are reduced on CPU. 
- */ -//static constexpr int Multireduction_minGpuDataSize = 16384;//65536; //16384;//1024;//256; -// TODO: benchmarks with different values -static constexpr int Multireduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * deviceInput1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the deviceInput1 array - * deviceInput2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Cuda >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* deviceInput1, - const Index ldInput1, - const typename Operation::DataType2* deviceInput2, - typename Operation::ResultType* hostResult ) -{ -#ifdef HAVE_CUDA - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." ); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - typedef typename Operation::LaterReductionOperation LaterReductionOperation; - - /*** - * First check if the input array(s) is/are large enough for the multireduction on GPU. - * Otherwise copy it/them to host and multireduce on CPU. 
- */ - if( n * ldInput1 < Multireduction_minGpuDataSize ) { - DataType1 hostArray1[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray1, deviceInput1, n * ldInput1 ); - if( deviceInput2 ) { - using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; - _DT2 hostArray2[ Multireduction_minGpuDataSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( hostArray2, (_DT2*) deviceInput2, size ); - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, hostArray2, hostResult ); - } - else { - Multireduction< Devices::Host >::reduce( operation, n, size, hostArray1, ldInput1, (DataType2*) nullptr, hostResult ); - } - return; - } - - #ifdef CUDA_REDUCTION_PROFILING - Timer timer; - timer.reset(); - timer.start(); - #endif - - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1 = nullptr; - const IndexType reducedSize = CudaMultireductionKernelLauncher( operation, - n, - size, - deviceInput1, - ldInput1, - deviceInput2, - deviceAux1 ); - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of " << n << " datasets on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; - timer.reset(); - timer.start(); - #endif - - /*** - * Transfer the reduced data from device to host. - */ - ResultType resultArray[ n * reducedSize ]; - ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray, deviceAux1, n * reducedSize ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Transferring data to CPU took " << timer.getRealTime() << " sec. 
" << std::endl; - timer.reset(); - timer.start(); - #endif - -// std::cout << "resultArray = ["; -// for( int i = 0; i < n * reducedSize; i++ ) { -// std::cout << resultArray[ i ]; -// if( i < n * reducedSize - 1 ) -// std::cout << ", "; -// } -// std::cout << "]" << std::endl; - - /*** - * Reduce the data on the host system. - */ - LaterReductionOperation laterReductionOperation; - Multireduction< Devices::Host >::reduce( laterReductionOperation, n, reducedSize, resultArray, reducedSize, (void*) nullptr, hostResult ); - - #ifdef CUDA_REDUCTION_PROFILING - timer.stop(); - std::cout << " Multireduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; - #endif - - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; - -/* - * Parameters: - * operation: the operation used for reduction - * n: number of datasets to be reduced - * size: the size of each dataset - * input1: input array of size = n * ldInput1 - * ldInput1: leading dimension of the input1 array - * input2: either nullptr or input array of size = size - * hostResult: output array of size = n - */ -template< typename Operation, typename Index > -void -Multireduction< Devices::Host >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - TNL_ASSERT_GT( n, 0, "The number of datasets must be positive." ); - TNL_ASSERT_LE( size, ldInput1, "The size of the input cannot exceed its leading dimension." 
); - - typedef Index IndexType; - typedef typename Operation::DataType1 DataType1; - typedef typename Operation::DataType2 DataType2; - typedef typename Operation::ResultType ResultType; - - constexpr int block_size = 128; - const int blocks = size / block_size; - -#ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && blocks >= 2 ) -#pragma omp parallel - { - // first thread initializes the result array - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - } - - // initialize array for thread-local results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - #pragma omp for nowait - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // the first thread that reaches here processes the last, incomplete block - #pragma omp single nowait - { - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - } - - // local reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - } - - // inter-thread reduction of local results - #pragma omp critical - { - for( int k 
= 0; k < n; k++ ) - operation.commonReduction( result[ k ], r[ 4 * k ] ); - } - } - else { -#endif - if( blocks > 1 ) { - // initialize array for unrolled results - // (it is accessed as a row-major matrix with n rows and 4 columns) - ResultType r[ n * 4 ]; - for( int k = 0; k < n * 4; k++ ) - r[ k ] = operation.initialValue(); - - // main reduction (explicitly unrolled loop) - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( int i = 0; i < block_size; i += 4 ) { - operation.firstReduction( _r[ 0 ], offset + i, _input1, input2 ); - operation.firstReduction( _r[ 1 ], offset + i + 1, _input1, input2 ); - operation.firstReduction( _r[ 2 ], offset + i + 2, _input1, input2 ); - operation.firstReduction( _r[ 3 ], offset + i + 3, _input1, input2 ); - } - } - } - - // reduction of the last, incomplete block (not unrolled) - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - ResultType* _r = r + 4 * k; - for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( _r[ 0 ], i, _input1, input2 ); - } - - // reduction of unrolled results - for( int k = 0; k < n; k++ ) { - ResultType* _r = r + 4 * k; - operation.commonReduction( _r[ 0 ], _r[ 1 ] ); - operation.commonReduction( _r[ 0 ], _r[ 2 ] ); - operation.commonReduction( _r[ 0 ], _r[ 3 ] ); - - // copy the result into the output parameter - result[ k ] = _r[ 0 ]; - } - } - else { - for( int k = 0; k < n; k++ ) - result[ k ] = operation.initialValue(); - - for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; - for( int i = 0; i < block_size; i++ ) - operation.firstReduction( result[ k ], offset + i, _input1, input2 ); - } - } - - for( int k = 0; k < n; k++ ) { - const DataType1* _input1 = input1 + k * ldInput1; 
- for( IndexType i = blocks * block_size; i < size; i++ ) - operation.firstReduction( result[ k ], i, _input1, input2 ); - } - } -#ifdef HAVE_OPENMP - } -#endif -} - -template< typename Operation, typename Index > -void -Multireduction< Devices::MIC >:: -reduce( Operation& operation, - const int n, - const Index size, - const typename Operation::DataType1* input1, - const Index ldInput1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* result ) -{ - throw Exceptions::NotImplementedError("Multireduction is not implemented for MIC."); -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/PrefixSum.h b/src/TNL/Containers/Algorithms/PrefixSum.h index 715d6f1b982c53be5d18b9c3216af955aa8430b8..2b0e40458089ebfe76f1573f07891983103c848d 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.h +++ b/src/TNL/Containers/Algorithms/PrefixSum.h @@ -14,92 +14,121 @@ #include #include -#include -#include namespace TNL { namespace Containers { namespace Algorithms { +enum class PrefixSumType { + Exclusive, + Inclusive +}; + template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class PrefixSum {}; +struct PrefixSum; template< typename Device, PrefixSumType Type = PrefixSumType::Inclusive > -class SegmentedPrefixSum {}; +struct SegmentedPrefixSum; template< PrefixSumType Type > -class PrefixSum< Devices::Host, Type > +struct PrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename 
Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class PrefixSum< Devices::Cuda, Type > +struct PrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > - static void - perform( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction > + static void + perform( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename Reduction > + static auto + performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); + + template< typename Vector, + typename BlockShifts, + typename Reduction > + static void + performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ); }; template< PrefixSumType Type > -class 
SegmentedPrefixSum< Devices::Host, Type > +struct SegmentedPrefixSum< Devices::Host, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; template< PrefixSumType Type > -class SegmentedPrefixSum< Devices::Cuda, Type > +struct SegmentedPrefixSum< Devices::Cuda, Type > { - public: - template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, - typename Flags > - static void - perform( Vector& v, - Flags& flags, - const typename Vector::IndexType begin, - const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ); + template< typename Vector, + typename Reduction, + typename Flags > + static void + perform( Vector& v, + Flags& flags, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ); }; - - } // namespace Algorithms } // namespace Containers } // namespace TNL diff --git a/src/TNL/Containers/Algorithms/PrefixSum.hpp b/src/TNL/Containers/Algorithms/PrefixSum.hpp index d3c3b3071928e89a284bf1db4acea1e8c30ccf3e..8af19d09a5ae3a741c8cd9a612c1dad60bdb694e 100644 --- a/src/TNL/Containers/Algorithms/PrefixSum.hpp +++ b/src/TNL/Containers/Algorithms/PrefixSum.hpp @@ -14,103 +14,228 @@ #include "PrefixSum.h" -//#define 
CUDA_REDUCTION_PROFILING - #include -#include -#include -#include +#include #include +#include #include -#ifdef CUDA_REDUCTION_PROFILING -#include -#include -#endif - namespace TNL { namespace Containers { namespace Algorithms { -/**** - * Arrays smaller than the following constant - * are reduced on CPU. The constant must not be larger - * than maximal CUDA grid size. - */ -static constexpr int PrefixSum_minGpuDataSize = 256;//65536; //16384;//1024;//256; - -//// -// PrefixSum on host template< PrefixSumType Type > -template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Host, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) +{ +#ifdef HAVE_OPENMP + const auto blockShifts = performFirstPhase( v, begin, end, reduction, zero ); + performSecondPhase( v, blockShifts, begin, end, reduction, zero ); +#else + // sequential prefix-sum does not need a second phase + performFirstPhase( v, begin, end, reduction, zero ); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Host, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - // TODO: parallelize with OpenMP - if( Type == PrefixSumType::Inclusive ) +#ifdef HAVE_OPENMP + const int threads = Devices::Host::getMaxThreadsCount(); + Array< RealType, Devices::Host > block_sums( threads + 1 ); + block_sums[ 0 ] = zero; + + #pragma omp parallel num_threads(threads) 
+ { + // init + const int thread_idx = omp_get_thread_num(); + RealType block_sum = zero; + + // perform prefix-sum on blocks statically assigned to threads + if( Type == PrefixSumType::Inclusive ) { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + block_sum = reduction( block_sum, v[ i ] ); + v[ i ] = block_sum; + } + } + else { + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; + v[ i ] = block_sum; + block_sum = reduction( block_sum, x ); + } + } + + // write the block sums into the buffer + block_sums[ thread_idx + 1 ] = block_sum; + } + + // block_sums now contains sums of numbers in each block. The first phase + // ends by computing prefix-sum of this array. + for( int i = 1; i < threads + 1; i++ ) + block_sums[ i ] = reduction( block_sums[ i ], block_sums[ i - 1 ] ); + + // block_sums now contains shift values for each block - to be used in the second phase + return block_sums; +#else + if( Type == PrefixSumType::Inclusive ) { for( IndexType i = begin + 1; i < end; i++ ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); + } else // Exclusive prefix sum { - RealType aux( v[ begin ] ); - v[ begin ] = zero; - for( IndexType i = begin + 1; i < end; i++ ) - { - RealType x = v[ i ]; + RealType aux = zero; + for( IndexType i = begin; i < end; i++ ) { + const RealType x = v[ i ]; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } + + return 0; +#endif } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > -template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Host, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename 
Vector::RealType shift ) +{ + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + +#ifdef HAVE_OPENMP + const int threads = blockShifts.getSize() - 1; + + // launch exactly the same number of threads as in the first phase + #pragma omp parallel num_threads(threads) + { + const int thread_idx = omp_get_thread_num(); + const RealType offset = reduction( blockShifts[ thread_idx ], shift ); + + // shift intermediate results by the offset + #pragma omp for schedule(static) + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], offset ); + } +#else + for( IndexType i = begin; i < end; i++ ) + v[ i ] = reduction( v[ i ], shift ); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > void PrefixSum< Devices::Cuda, Type >:: perform( Vector& v, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::perform( + end - begin, + &v[ begin ], // input + &v[ begin ], // output + reduction, + zero ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename Reduction > +auto +PrefixSum< Devices::Cuda, Type >:: +performFirstPhase( Vector& v, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType zero ) +{ #ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], + using RealType = 
typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + return CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performFirstPhase( + end - begin, + &v[ begin ], // input + &v[ begin ], // output reduction, - volatileReduction, zero ); +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< PrefixSumType Type > + template< typename Vector, + typename BlockShifts, + typename Reduction > +void +PrefixSum< Devices::Cuda, Type >:: +performSecondPhase( Vector& v, + const BlockShifts& blockShifts, + const typename Vector::IndexType begin, + const typename Vector::IndexType end, + const Reduction& reduction, + const typename Vector::RealType shift ) +{ +#ifdef HAVE_CUDA + using RealType = typename Vector::RealType; + using IndexType = typename Vector::IndexType; + + CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::performSecondPhase( + end - begin, + &v[ begin ], // output + blockShifts.getData(), + reduction, + shift ); +#else + throw Exceptions::CudaSupportMissing(); #endif } -//// -// PrefixSum on host template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Host, Type >:: @@ -118,9 +243,8 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatilePrefixSum, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) { using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; @@ -130,7 +254,7 @@ perform( Vector& v, { for( IndexType i = begin + 1; i < end; i++ ) if( ! 
flags[ i ] ) - reduction( v[ i ], v[ i - 1 ] ); + v[ i ] = reduction( v[ i ], v[ i - 1 ] ); } else // Exclusive prefix sum { @@ -142,17 +266,14 @@ perform( Vector& v, if( flags[ i ] ) aux = zero; v[ i ] = aux; - reduction( aux, x ); + aux = reduction( aux, x ); } } } -//// -// PrefixSum on CUDA device template< PrefixSumType Type > template< typename Vector, - typename PrefixSumOperation, - typename VolatilePrefixSumOperation, + typename Reduction, typename Flags > void SegmentedPrefixSum< Devices::Cuda, Type >:: @@ -160,23 +281,16 @@ perform( Vector& v, Flags& flags, const typename Vector::IndexType begin, const typename Vector::IndexType end, - PrefixSumOperation& reduction, - VolatilePrefixSumOperation& volatileReduction, - const typename Vector::RealType& zero ) + const Reduction& reduction, + const typename Vector::RealType zero ) { +#ifdef HAVE_CUDA using RealType = typename Vector::RealType; using IndexType = typename Vector::IndexType; - using IndexType = typename Vector::IndexType; -#ifdef HAVE_CUDA - throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." ); // NOT IMPLEMENTED YET - /*CudaPrefixSumKernelLauncher< Type, RealType, IndexType >::start( - ( IndexType ) ( end - begin ), - ( IndexType ) 256, - &v[ begin ], - &v[ begin ], - reduction, - volatileReduction, - zero );*/ + + throw Exceptions::NotImplementedError( "Segmented prefix sum is not implemented for CUDA." 
); +#else + throw Exceptions::CudaSupportMissing(); #endif } diff --git a/src/TNL/Containers/Algorithms/PrefixSumType.h b/src/TNL/Containers/Algorithms/PrefixSumType.h deleted file mode 100644 index ba03adfe6bafc66be96c96172aa97572727ee80f..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/PrefixSumType.h +++ /dev/null @@ -1,24 +0,0 @@ -/*************************************************************************** - PrefixSumType.h - description - ------------------- - begin : Jun 6, 2019 - copyright : (C) 2019 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -namespace TNL { -namespace Containers { -namespace Algorithms { - -enum class PrefixSumType { - Exclusive, - Inclusive -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/Reduction.h b/src/TNL/Containers/Algorithms/Reduction.h index e2c6d5295606bef5c1e1bf71f988b3a220fe39b2..41b000221bb5e7af34828694fbc752607f4ca0c4 100644 --- a/src/TNL/Containers/Algorithms/Reduction.h +++ b/src/TNL/Containers/Algorithms/Reduction.h @@ -13,103 +13,64 @@ #pragma once #include // std::pair +#include // reduction functions like std::plus, std::logical_and, std::logical_or etc. 
#include #include -#include namespace TNL { namespace Containers { namespace Algorithms { template< typename Device > -class Reduction; +struct Reduction; template<> -class Reduction< Devices::Host > +struct Reduction< Devices::Host > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; template<> -class Reduction< Devices::Cuda > +struct Reduction< Devices::Cuda > { - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static Result + reduce( const Index size, + const 
ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); -}; - -template<> -class Reduction< Devices::MIC > -{ - public: - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static Result - reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); - - template< typename Index, - typename Result, - typename ReductionOperation, - typename VolatileReductionOperation, - typename DataFetcher > - static std::pair< Index, Result > - reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, - DataFetcher& dataFetcher, - const Result& zero ); + template< typename Index, + typename Result, + typename ReductionOperation, + typename DataFetcher > + static std::pair< Index, Result > + reduceWithArgument( const Index size, + const ReductionOperation& reduction, + DataFetcher& dataFetcher, + const Result& zero ); }; } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/Reduction.hpp b/src/TNL/Containers/Algorithms/Reduction.hpp index 9d54fd7b1e77c404f4bbfc89dafd3d0c99d9ff24..229af13797f82a4f27f67bc81a5bfb6886a65604 100644 --- a/src/TNL/Containers/Algorithms/Reduction.hpp +++ b/src/TNL/Containers/Algorithms/Reduction.hpp @@ -12,13 +12,11 @@ #pragma once +#include // std::unique_ptr //#define CUDA_REDUCTION_PROFILING -#include -#include #include -#include #include #include @@ -43,58 +41,54 @@ static constexpr int Reduction_minGpuDataSize = 
256;//65536; //16384;//1024;//25 template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Host >:: reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable - ResultType result = zero; -#pragma omp parallel + Result result = zero; + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); + for( Index i = blocks * block_size; i < size; i++ ) + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); } // 
local reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); // inter-thread reduction of local results #pragma omp critical { - reduction( result, r[ 0 ] ); + result = reduction( result, r[ 0 ] ); } } return result; @@ -103,34 +97,33 @@ reduce( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Result r[ 4 ] = { zero, zero, zero, zero }; // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { - reduction( r[ 0 ], dataFetcher( offset + i ) ); - reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); - reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); - reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); + r[ 0 ] = reduction( r[ 0 ], dataFetcher( offset + i ) ); + r[ 1 ] = reduction( r[ 1 ], dataFetcher( offset + i + 1 ) ); + r[ 2 ] = reduction( r[ 2 ], dataFetcher( offset + i + 2 ) ); + r[ 3 ] = reduction( r[ 3 ], dataFetcher( offset + i + 3 ) ); } } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) - reduction( r[ 0 ], dataFetcher( i ) ); - //operation.dataFetcher( r[ 0 ], i, input1, input2 ); + for( Index i = blocks * block_size; i < size; i++ ) + r[ 0 ] = reduction( r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results - reduction( r[ 0 ], r[ 2 ] ); - reduction( r[ 1 ], r[ 3 ] ); - reduction( r[ 0 ], r[ 1 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 2 ] ); + r[ 1 ] = reduction( r[ 1 ], r[ 3 ] ); + r[ 0 ] = reduction( r[ 0 ], r[ 1 ] ); return r[ 0 ]; } else { - ResultType result = zero; - for( IndexType i = 0; i < size; i++ ) - reduction( result, dataFetcher( i ) ); + Result result = 
zero; + for( Index i = 0; i < size; i++ ) + result = reduction( result, dataFetcher( i ) ); return result; } #ifdef HAVE_OPENMP @@ -141,36 +134,32 @@ reduce( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Host >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { - using IndexType = Index; - using ResultType = Result; - constexpr int block_size = 128; const int blocks = size / block_size; #ifdef HAVE_OPENMP - if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { + if( Devices::Host::isOMPEnabled() && blocks >= 2 ) { // global result variable std::pair< Index, Result > result( -1, zero ); -#pragma omp parallel + const int threads = TNL::min( blocks, Devices::Host::getMaxThreadsCount() ); +#pragma omp parallel num_threads(threads) { // initialize array for thread-local results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); #pragma omp for nowait for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! 
initialized ) { arg[ 0 ] = offset + i; @@ -194,7 +183,7 @@ reduceWithArgument( const Index size, // the first thread that reaches here processes the last, incomplete block #pragma omp single nowait { - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); } @@ -217,13 +206,13 @@ reduceWithArgument( const Index size, #endif if( blocks > 1 ) { // initialize array for unrolled results - IndexType arg[ 4 ] = { 0, 0, 0, 0 }; - ResultType r[ 4 ] = { zero, zero, zero, zero }; + Index arg[ 4 ] = { 0, 0, 0, 0 }; + Result r[ 4 ] = { zero, zero, zero, zero }; bool initialized( false ); // main reduction (explicitly unrolled loop) for( int b = 0; b < blocks; b++ ) { - const IndexType offset = b * block_size; + const Index offset = b * block_size; for( int i = 0; i < block_size; i += 4 ) { if( ! initialized ) { @@ -246,7 +235,7 @@ reduceWithArgument( const Index size, } // reduction of the last, incomplete block (not unrolled) - for( IndexType i = blocks * block_size; i < size; i++ ) + for( Index i = blocks * block_size; i < size; i++ ) reduction( arg[ 0 ], i, r[ 0 ], dataFetcher( i ) ); // reduction of unrolled results @@ -257,7 +246,7 @@ reduceWithArgument( const Index size, } else { std::pair< Index, Result > result( 0, dataFetcher( 0 ) ); - for( IndexType i = 1; i < size; i++ ) + for( Index i = 1; i < size; i++ ) reduction( result.first, i, result.second, dataFetcher( i ) ); return result; } @@ -269,28 +258,18 @@ reduceWithArgument( const Index size, template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > Result Reduction< Devices::Cuda >:: reduce( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using 
ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. - */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. + constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -298,18 +277,16 @@ reduce( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); + CudaReductionKernelLauncher< Index, Result > reductionLauncher( size ); - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( 0 ); - IndexType reducedSize = reductionLauncher.start( + // start the reduction on the GPU + Result* deviceAux1( 0 ); + const int reducedSize = reductionLauncher.start( reduction, - volatileReduction, dataFetcher, zero, deviceAux1 ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -318,10 +295,21 @@ reduce( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. 
- */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new Result[ static_cast<size_t>(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); #ifdef CUDA_REDUCTION_PROFILING @@ -331,11 +319,9 @@ reduce( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - const ResultType result = Reduction< Devices::Host >::reduce( reducedSize, reduction, volatileReduction, fetch, zero ); + // finish the reduction on the host + auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; + const Result result = Reduction< Devices::Host >::reduce( reducedSize, reduction, fetch, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -344,10 +330,8 @@ reduce( const Index size, return result; } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
- */ - auto result = reductionLauncher.finish( reduction, volatileReduction, zero ); + // data can't be safely reduced on host, so continue with the reduction on the GPU + auto result = reductionLauncher.finish( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -358,36 +342,23 @@ reduce( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif -}; +} template< typename Index, typename Result, typename ReductionOperation, - typename VolatileReductionOperation, typename DataFetcher > std::pair< Index, Result > Reduction< Devices::Cuda >:: reduceWithArgument( const Index size, - ReductionOperation& reduction, - VolatileReductionOperation& volatileReduction, + const ReductionOperation& reduction, DataFetcher& dataFetcher, const Result& zero ) { -#ifdef HAVE_CUDA - - using IndexType = Index; - using ResultType = Result; - - /*** - * Only fundamental and pointer types can be safely reduced on host. Complex - * objects stored on the device might contain pointers into the device memory, - * in which case reduction on host might fail. - */ - //constexpr bool can_reduce_all_on_host = std::is_fundamental< DataType1 >::value || std::is_fundamental< DataType2 >::value || std::is_pointer< DataType1 >::value || std::is_pointer< DataType2 >::value; - constexpr bool can_reduce_later_on_host = std::is_fundamental< ResultType >::value || std::is_pointer< ResultType >::value; + // Only fundamental and pointer types can be safely reduced on host. Complex + // objects stored on the device might contain pointers into the device memory, + // in which case reduction on host might fail. 
+ constexpr bool can_reduce_later_on_host = std::is_fundamental< Result >::value || std::is_pointer< Result >::value; #ifdef CUDA_REDUCTION_PROFILING Timer timer; @@ -395,20 +366,18 @@ reduceWithArgument( const Index size, timer.start(); #endif - CudaReductionKernelLauncher< IndexType, ResultType > reductionLauncher( size ); - - /**** - * Reduce the data on the CUDA device. - */ - ResultType* deviceAux1( nullptr ); - IndexType* deviceIndexes( nullptr ); - IndexType reducedSize = reductionLauncher.startWithArgument( + // start the reduction on the GPU + Result* deviceAux1( nullptr ); + Index* deviceIndexes( nullptr ); + const int reducedSize = reductionLauncher.startWithArgument( reduction, - volatileReduction, dataFetcher, zero, deviceAux1, deviceIndexes ); + #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction on GPU to size " << reducedSize << " took " << timer.getRealTime() << " sec. " << std::endl; @@ -417,11 +386,35 @@ reduceWithArgument( const Index size, #endif if( can_reduce_later_on_host ) { - /*** - * Transfer the reduced data from device to host. - */ - std::unique_ptr< ResultType[] > resultArray{ new ResultType[ reducedSize ] }; - std::unique_ptr< IndexType[] > indexArray{ new IndexType[ reducedSize ] }; + // transfer the reduced data from device to host + std::unique_ptr< Result[] > resultArray{ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Result[reducedSize]` in the source code to `new (Result[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. 
+ #ifdef __NVCC__ + new Result[ static_cast<size_t>(reducedSize) ] + #else + new Result[ reducedSize ] + #endif + }; + std::unique_ptr< Index[] > indexArray{ + // Workaround for nvcc 10.1.168 - it would modify the simple expression + // `new Index[reducedSize]` in the source code to `new (Index[reducedSize])` + // which is not correct - see e.g. https://stackoverflow.com/a/39671946 + // Thus, the host compiler would spit out hundreds of warnings... + // Funnily enough, nvcc's behaviour depends on the context rather than the + // expression, because exactly the same simple expression in different places + // does not produce warnings. + #ifdef __NVCC__ + new Index[ static_cast<size_t>(reducedSize) ] + #else + new Index[ reducedSize ] + #endif + }; ArrayOperations< Devices::Host, Devices::Cuda >::copy( resultArray.get(), deviceAux1, reducedSize ); ArrayOperations< Devices::Host, Devices::Cuda >::copy( indexArray.get(), deviceIndexes, reducedSize ); @@ -432,12 +425,10 @@ reduceWithArgument( const Index size, timer.start(); #endif - /*** - * Reduce the data on the host system. - */ - //auto fetch = [&] ( IndexType i ) { return resultArray[ i ]; }; - //const ResultType result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, volatileReduction, fetch, zero ); - for( IndexType i = 1; i < reducedSize; i++ ) + // finish the reduction on the host +// auto fetch = [&] ( Index i ) { return resultArray[ i ]; }; +// const Result result = Reduction< Devices::Host >::reduceWithArgument( reducedSize, argument, reduction, fetch, zero ); + for( Index i = 1; i < reducedSize; i++ ) reduction( indexArray[ 0 ], indexArray[ i ], resultArray[ 0 ], resultArray[ i ] ); #ifdef CUDA_REDUCTION_PROFILING @@ -447,10 +438,8 @@ reduceWithArgument( const Index size, return std::make_pair( indexArray[ 0 ], resultArray[ 0 ] ); } else { - /*** - * Data can't be safely reduced on host, so continue with the reduction on the CUDA device. 
- */ - auto result = reductionLauncher.finishWithArgument( reduction, volatileReduction, zero ); + // data can't be safely reduced on host, so continue with the reduction on the GPU + auto result = reductionLauncher.finishWithArgument( reduction, zero ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); @@ -461,9 +450,6 @@ reduceWithArgument( const Index size, return result; } -#else - throw Exceptions::CudaSupportMissing(); -#endif } } // namespace Algorithms diff --git a/src/TNL/Containers/Algorithms/ReductionOperations.h b/src/TNL/Containers/Algorithms/ReductionOperations.h deleted file mode 100644 index 33ef84b1c7f2fb0f9cb5ea4b72fdb9776fcce08c..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/ReductionOperations.h +++ /dev/null @@ -1,641 +0,0 @@ -/*************************************************************************** - ReductionOperations.h - description - ------------------- - begin : Mar 22, 2013 - copyright : (C) 2013 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include // std::numeric_limits - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -/* - * Unary operations: reduction on one input vector. 
- */ - -template< typename Data, typename Result = Data > -class ParallelReductionSum -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const ResultType& data ) - { - result += data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const ResultType& data ) - { - result += data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMin -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] ); - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::min( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::min( result, data ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionMax -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - 
__cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] ); - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = TNL::max( result, data ); - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = TNL::max( result, data ); - } -}; - -template< typename Data, typename Result = bool > -class ParallelReductionLogicalAnd -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result && data; - } - - __cuda_callable__ void - commonReduction( volatile ResultType& result, - volatile const Result& data ) - { - result = result && data; - } -}; - - -template< typename Data, typename Result = bool > -class ParallelReductionLogicalOr -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || data1[ index ]; - } - - __cuda_callable__ void - commonReduction( ResultType& result, - const Result& data ) - { - result = result || data; - } - - __cuda_callable__ void - commonReduction( volatile 
ResultType& result, - volatile const Result& data ) - { - result = result || data; - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsSum : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsMin : public ParallelReductionMin< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionAbsMax : public ParallelReductionMax< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] ) ); - } -}; - -template< typename Data, typename Result = Data > -class ParallelReductionL2Norm : public ParallelReductionSum< Data, Result > 
-{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const Data& aux = data1[ index ]; - result += aux * aux; - } -}; - - -template< typename Data, typename Result = Data, typename PType = Data > -class ParallelReductionLpNorm : public ParallelReductionSum< Data, Result > -{ -public: - using DataType1 = Data; - using DataType2 = void; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] ), p ); - } - -protected: - PType p; -}; - - -/* - * Binary operations: reduction on two input vectors. 
- */ - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionEqualities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return true; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = bool > -class ParallelReductionInequalities : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - static constexpr Result initialValue() { return false; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] != data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionScalarProduct : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] * data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffSum : public ParallelReductionSum< Result, 
Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += data1[ index ] - data2[ index ]; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::lowest(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsSum : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = 
Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::abs( data1[ index ] - data2[ index ] ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMin : public ParallelReductionMin< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMin< Result >; - - static constexpr Result initialValue() { return std::numeric_limits< Result >::max(); }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::min( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffAbsMax : public ParallelReductionMax< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionMax< Result >; - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = TNL::max( result, TNL::abs( data1[ index ] - data2[ index ] ) ); - } -}; - -template< typename Data1, typename Data2, typename Result = Data1 > -class ParallelReductionDiffL2Norm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - static 
constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - const ResultType aux = data2[ index ] - data1[ index ]; - result += aux * aux; - } -}; - -template< typename Data1, typename Data2, typename Result = Data1, typename PType = Data1 > -class ParallelReductionDiffLpNorm : public ParallelReductionSum< Result, Result > -{ -public: - using DataType1 = Data1; - using DataType2 = Data2; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionSum< Result >; - - void setPower( const PType p ) - { - this->p = p; - } - - static constexpr Result initialValue() { return 0; }; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result += TNL::pow( TNL::abs( data1[ index ] - data2[ index ] ), p ); - } - -protected: - PType p; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsValue : public ParallelReductionLogicalOr< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalOr< Result >; - - template< typename Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result || ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -template< typename Data, typename Result = bool > -class ParallelReductionContainsOnlyValue : public ParallelReductionLogicalAnd< Result > -{ -public: - using DataType1 = Data; - using DataType2 = Data; - using ResultType = Result; - using LaterReductionOperation = ParallelReductionLogicalAnd< Result >; - - template< typename 
Index > - __cuda_callable__ void - firstReduction( ResultType& result, - const Index& index, - const DataType1* data1, - const DataType2* data2 ) - { - result = result && ( data1[ index ] == value ); - } - - void setValue( const Data& v ) - { - this->value = v; - } - -protected: - Data value; -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorAssignment.h b/src/TNL/Containers/Algorithms/VectorAssignment.h index 36829be82a70c2ff74f39d275c6a411c21e34873..c861579f4e90810b024d2d84b5ea3c2eeaf92234 100644 --- a/src/TNL/Containers/Algorithms/VectorAssignment.h +++ b/src/TNL/Containers/Algorithms/VectorAssignment.h @@ -12,7 +12,6 @@ #include #include -#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Algorithms/VectorOperations.h b/src/TNL/Containers/Algorithms/VectorOperations.h deleted file mode 100644 index a12b310e36b7d9bf2b458bceaf9db1547162863c..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/VectorOperations.h +++ /dev/null @@ -1,64 +0,0 @@ -/*************************************************************************** - VectorOperations.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< typename Device > -class VectorOperations{}; - -template<> -class VectorOperations< Devices::Host > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void 
segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -template<> -class VectorOperations< Devices::Cuda > -{ -public: - template< Algorithms::PrefixSumType Type, - typename Vector > - static void prefixSum( Vector& v, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); - - template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > - static void segmentedPrefixSum( Vector& v, - Flags& f, - const typename Vector::IndexType begin, - const typename Vector::IndexType end ); -}; - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL - -#include -#include diff --git a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h deleted file mode 100644 index 4b53dbcf73eeb8b90a5388bbe7b1d443ff24574d..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +++ /dev/null @@ -1,58 +0,0 @@ -/*************************************************************************** - VectorOperationsCuda_impl.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, - typename Vector > -void -VectorOperations< Devices::Cuda >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, 
volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Cuda, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Cuda >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Cuda, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h b/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h deleted file mode 100644 index 50d591b9f31d11097eeb1047bdba465dda906b95..0000000000000000000000000000000000000000 --- a/src/TNL/Containers/Algorithms/VectorOperationsHost_impl.h +++ /dev/null @@ -1,55 +0,0 @@ -/*************************************************************************** - VectorOperationsHost_impl.h - description - ------------------- - begin : Nov 8, 2012 - copyright : (C) 2012 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include -#include - -namespace TNL { -namespace Containers { -namespace Algorithms { - -template< Algorithms::PrefixSumType Type, typename Vector > -void -VectorOperations< Devices::Host >:: -prefixSum( Vector& v, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename 
Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - PrefixSum< Devices::Host, Type >::perform( v, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -template< Algorithms::PrefixSumType Type, typename Vector, typename Flags > -void -VectorOperations< Devices::Host >:: -segmentedPrefixSum( Vector& v, - Flags& f, - typename Vector::IndexType begin, - typename Vector::IndexType end ) -{ - using RealType = typename Vector::RealType; - using IndexType = typename Vector::IndexType; - - auto reduction = [=] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - - SegmentedPrefixSum< Devices::Host, Type >::perform( v, f, begin, end, reduction, volatileReduction, ( RealType ) 0.0 ); -} - -} // namespace Algorithms -} // namespace Containers -} // namespace TNL diff --git a/src/TNL/Containers/DistributedVector.h b/src/TNL/Containers/DistributedVector.h index 75f204dccba363da9b5d079091680bd4272f03df..3438ddbd00378c0e3870a49b60f8151ef165a810 100644 --- a/src/TNL/Containers/DistributedVector.h +++ b/src/TNL/Containers/DistributedVector.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVector& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVector.hpp b/src/TNL/Containers/DistributedVector.hpp index 
73781c90640eebe0270082174e0a871b1fce511d..0a6ac1547f1f1b6f6fae3b92d1dcd7a2804a83b9 100644 --- a/src/TNL/Containers/DistributedVector.hpp +++ b/src/TNL/Containers/DistributedVector.hpp @@ -13,8 +13,7 @@ #pragma once #include "DistributedVector.h" -#include -#include +#include namespace TNL { namespace Containers { @@ -299,44 +298,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< Algorithms::PrefixSumType Type > void DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVector< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.h b/src/TNL/Containers/DistributedVectorView.h index 0ab9232001f84c7ea954f02dd21d55830b414cad..f0e7d91278f06f4ecf1f2688083b79c3e2ca66e9 100644 --- 
a/src/TNL/Containers/DistributedVectorView.h +++ b/src/TNL/Containers/DistributedVectorView.h @@ -127,13 +127,8 @@ public: typename = std::enable_if_t< HasSubscriptOperator::value > > DistributedVectorView& operator/=( const Vector& vector ); - void computePrefixSum(); - - void computePrefixSum( IndexType begin, IndexType end ); - - void computeExclusivePrefixSum(); - - void computeExclusivePrefixSum( IndexType begin, IndexType end ); + template< Algorithms::PrefixSumType Type = Algorithms::PrefixSumType::Inclusive > + void prefixSum( IndexType begin = 0, IndexType end = 0 ); }; } // namespace Containers diff --git a/src/TNL/Containers/DistributedVectorView.hpp b/src/TNL/Containers/DistributedVectorView.hpp index 38715161cc46f25b2aa705b3cdb6c302aa1cad38..0268e35da5b2dc9ea1eb24dbec738015abe64a84 100644 --- a/src/TNL/Containers/DistributedVectorView.hpp +++ b/src/TNL/Containers/DistributedVectorView.hpp @@ -13,8 +13,7 @@ #pragma once #include "DistributedVectorView.h" -#include -#include +#include namespace TNL { namespace Containers { @@ -275,44 +274,14 @@ template< typename Real, typename Device, typename Index, typename Communicator > + template< Algorithms::PrefixSumType Type > void DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum() +prefixSum( IndexType begin, IndexType end ) { - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum() -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not 
implemented yet."); -} - -template< typename Real, - typename Device, - typename Index, - typename Communicator > -void -DistributedVectorView< Real, Device, Index, Communicator >:: -computeExclusivePrefixSum( IndexType begin, IndexType end ) -{ - throw Exceptions::NotImplementedError("Distributed prefix sum is not implemented yet."); + if( end == 0 ) + end = this->getSize(); + Algorithms::DistributedPrefixSum< Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } } // namespace Containers diff --git a/src/TNL/Containers/Expressions/Comparison.h b/src/TNL/Containers/Expressions/Comparison.h index ff533e7813028f22914a129575fbe31bccebae14..616ad5807864a1b1f2cd7b5af765132021c99a24 100644 --- a/src/TNL/Containers/Expressions/Comparison.h +++ b/src/TNL/Containers/Expressions/Comparison.h @@ -65,9 +65,7 @@ struct VectorComparison< T1, T2, false > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -97,9 +95,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), 
std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -112,9 +108,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -127,9 +121,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -142,9 +134,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, VectorExpressionVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType 
>::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -160,9 +150,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a == b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const T2& b ) @@ -176,9 +164,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a > b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -187,9 +173,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a >= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, 
fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -198,9 +182,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a < b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ -209,9 +191,7 @@ struct Comparison< T1, T2, ArithmeticVariable, VectorExpressionVariable > using IndexType = typename T2::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a <= b[ i ]; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( b.getSize(), std::logical_and<>{}, fetch, true ); } }; @@ -227,9 +207,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] == b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool NE( const T1& a, const 
T2& b ) @@ -243,9 +221,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] > b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool GE( const T1& a, const T2& b ) @@ -254,9 +230,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] >= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LT( const T1& a, const T2& b ) @@ -265,9 +239,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] < b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } static bool LE( const T1& a, const T2& b ) @@ 
-276,9 +248,7 @@ struct Comparison< T1, T2, VectorExpressionVariable, ArithmeticVariable > using IndexType = typename T1::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) -> bool { return a[ i ] <= b; }; - auto reduction = [=] __cuda_callable__ ( bool& a, const bool& b ) { a &= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile bool& a, volatile bool& b ) { a &= b; }; - return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), reduction, volatileReduction, fetch, true ); + return Algorithms::Reduction< DeviceType >::reduce( a.getSize(), std::logical_and<>{}, fetch, true ); } }; diff --git a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h index 405b0014ac77a5287bff0680c78b671c8104ab51..1d3077b432303bf5ecddc7f4ff48b71d1a5b9c46 100644 --- a/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/DistributedExpressionTemplates.h @@ -2187,12 +2187,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2201,19 +2199,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class 
Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2222,7 +2218,7 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2232,12 +2228,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2250,19 +2244,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, 
const Result& zero ) { using RealType = typename Vector::RealType; @@ -2275,7 +2267,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2285,12 +2277,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2303,19 +2293,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::DistributedUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2328,7 +2316,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, 
zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h index 4e569a45d46ea5db0a7bb24e9418ae064e30d5ac..43940417bb0aa7fa97afe17e4fee3e4ce703a819 100644 --- a/src/TNL/Containers/Expressions/DistributedVerticalOperations.h +++ b/src/TNL/Containers/Expressions/DistributedVerticalOperations.h @@ -70,15 +70,7 @@ auto DistributedExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile RealType& a, volatile RealType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::max() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::max() ); result.first = gatheredResults[ result.first ].first; } return result; @@ -135,15 +127,7 @@ auto DistributedExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [] ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile RealType& a, volatile RealType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, volatileReduction, fetch, std::numeric_limits< RealType >::lowest() ); + result = Algorithms::Reduction< Devices::Host >::reduceWithArgument( (IndexType) nproc, reduction, fetch, std::numeric_limits< RealType >::lowest() ); result.first = gatheredResults[ result.first ].first; } return result; diff --git a/src/TNL/Containers/Expressions/ExpressionTemplates.h 
b/src/TNL/Containers/Expressions/ExpressionTemplates.h index 5a069b0b9958bf794b204e1236f530201dd4748b..f01d8a68e153f919871106d0039d6807f575fe1f 100644 --- a/src/TNL/Containers/Expressions/ExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/ExpressionTemplates.h @@ -2110,12 +2110,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2124,19 +2122,17 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2145,7 +2141,7 @@ Result evaluateAndReduce( Vector& lhs, RealType* lhs_data = lhs.getData(); auto fetch = [=] __cuda_callable__ ( IndexType i ) -> RealType { return ( lhs_data[ i ] = expression[ i ] ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< 
DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2155,12 +2151,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2173,19 +2167,17 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduce( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2198,7 +2190,7 @@ Result addAndReduce( Vector& lhs, lhs_data[ i ] += aux; return aux; }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } //// @@ -2208,12 +2200,10 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::BinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + 
const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2226,19 +2216,17 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::UnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { using RealType = typename Vector::RealType; @@ -2251,7 +2239,7 @@ Result addAndReduceAbs( Vector& lhs, lhs_data[ i ] += aux; return TNL::abs( aux ); }; - return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, volatileReduction, fetch, zero ); + return Containers::Algorithms::Reduction< DeviceType >::reduce( lhs.getSize(), reduction, fetch, zero ); } } // namespace TNL diff --git a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h index cce42592306b71cb6ca513b1e6efacbf29d1f5f6..1bafe7cfc0932be98839f7eee2fc99784dfaa55b 100644 --- a/src/TNL/Containers/Expressions/StaticExpressionTemplates.h +++ b/src/TNL/Containers/Expressions/StaticExpressionTemplates.h @@ -2276,18 +2276,16 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const 
Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2295,18 +2293,16 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result evaluateAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) - reduction( result, lhs[ i ] = expression[ i ] ); + result = reduction( result, lhs[ i ] = expression[ i ] ); return result; } @@ -2317,20 +2313,18 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2339,20 +2333,18 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduce( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < 
Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, aux ); + result = reduction( result, aux ); } return result; } @@ -2364,20 +2356,18 @@ template< typename Vector, typename T2, template< typename, typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticBinaryExpressionTemplate< T1, T2, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } @@ -2386,20 +2376,18 @@ template< typename Vector, typename T1, template< typename > class Operation, typename Reduction, - typename VolatileReduction, typename Result > __cuda_callable__ Result addAndReduceAbs( Vector& lhs, const Containers::Expressions::StaticUnaryExpressionTemplate< T1, Operation >& expression, - Reduction& reduction, - VolatileReduction& volatileReduction, + const Reduction& reduction, const Result& zero ) { Result result( zero ); for( int i = 0; i < Vector::getSize(); i++ ) { const Result aux = expression[ i ]; lhs[ i ] += aux; - reduction( result, TNL::abs( aux ) ); + result = reduction( result, TNL::abs( aux ) ); } return result; } diff --git a/src/TNL/Containers/Expressions/VerticalOperations.h b/src/TNL/Containers/Expressions/VerticalOperations.h index a1250578084b4f95d6c4de23884d9396fea397b0..29e904bbfafb5c338a523227bb9e226d4fda9970 100644 --- a/src/TNL/Containers/Expressions/VerticalOperations.h +++ b/src/TNL/Containers/Expressions/VerticalOperations.h @@ -32,9 +32,8 @@ auto ExpressionMin( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = 
[=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a < b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a < b ? a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::min( a, b ); }; + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -45,7 +44,7 @@ auto ExpressionArgMin( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a > b ) { a = b; aIdx = bIdx; @@ -53,15 +52,7 @@ auto ExpressionArgMin( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { - if( a > b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::max() ); } 
template< typename Expression > @@ -71,9 +62,8 @@ auto ExpressionMax( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a > b ? a : b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a > b ? a : b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::lowest() ); + auto reduction = [] __cuda_callable__ ( const ResultType& a, const ResultType& b ) { return TNL::max( a, b ); }; + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -84,15 +74,7 @@ auto ExpressionArgMax( const Expression& expression ) using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { - if( a < b ) { - a = b; - aIdx = bIdx; - } - else if( a == b && bIdx < aIdx ) - aIdx = bIdx; - }; - auto volatileReduction = [=] __cuda_callable__ ( volatile IndexType& aIdx, volatile IndexType& bIdx, volatile ResultType& a, volatile ResultType& b ) { + auto reduction = [] __cuda_callable__ ( IndexType& aIdx, const IndexType& bIdx, ResultType& a, const ResultType& b ) { if( a < b ) { a = b; aIdx = bIdx; @@ -100,7 +82,7 @@ auto ExpressionArgMax( const Expression& expression ) else if( a == b && bIdx < aIdx ) aIdx = bIdx; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, volatileReduction, fetch, 
std::numeric_limits< ResultType >::lowest() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduceWithArgument( expression.getSize(), reduction, fetch, std::numeric_limits< ResultType >::lowest() ); } template< typename Expression > @@ -110,9 +92,7 @@ auto ExpressionSum( const Expression& expression ) -> std::decay_t< decltype( ex using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -122,9 +102,7 @@ auto ExpressionL1Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::abs( expression[ i ] ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -134,9 +112,7 @@ auto ExpressionL2Norm( const Expression& expression ) -> std::decay_t< decltype( using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return 
expression[ i ] * expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression, typename Real > @@ -146,9 +122,7 @@ auto ExpressionLpNorm( const Expression& expression, const Real& p ) -> std::dec using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return TNL::pow( TNL::abs( expression[ i ] ), p ); }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a += b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a += b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::plus<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -158,9 +132,7 @@ auto ExpressionProduct( const Expression& expression ) -> std::decay_t< decltype using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a *= b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a *= b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 1 ); + return Algorithms::Reduction< typename 
Expression::DeviceType >::reduce( expression.getSize(), std::multiplies<>{}, fetch, (ResultType) 1 ); } template< typename Expression > @@ -170,9 +142,7 @@ auto ExpressionLogicalAnd( const Expression& expression ) -> std::decay_t< declt using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a && b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a && b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -182,9 +152,7 @@ auto ExpressionLogicalOr( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a || b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a || b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::logical_or<>{}, fetch, (ResultType) 0 ); } template< typename Expression > @@ -194,9 +162,7 @@ auto ExpressionBinaryAnd( const Expression& expression ) -> std::decay_t< declty using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] 
__cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a & b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a & b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, std::numeric_limits< ResultType >::max() ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_and<>{}, fetch, std::numeric_limits< ResultType >::max() ); } template< typename Expression > @@ -206,9 +172,7 @@ auto ExpressionBinaryOr( const Expression& expression ) -> std::decay_t< decltyp using IndexType = typename Expression::IndexType; auto fetch = [=] __cuda_callable__ ( IndexType i ) { return expression[ i ]; }; - auto reduction = [=] __cuda_callable__ ( ResultType& a, const ResultType& b ) { a = a | b; }; - auto volatileReduction = [=] __cuda_callable__ ( volatile ResultType& a, volatile ResultType& b ) { a = a | b; }; - return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), reduction, volatileReduction, fetch, (ResultType) 0 ); + return Algorithms::Reduction< typename Expression::DeviceType >::reduce( expression.getSize(), std::bit_or<>{}, fetch, (ResultType) 0 ); } } // namespace Expressions diff --git a/src/TNL/Containers/Vector.hpp b/src/TNL/Containers/Vector.hpp index 7338c8317e4a5b29297240ff40d9de8ae73ff194..a8c626ee5e56925d9e1b9331bde87ddbdeff3a59 100644 --- a/src/TNL/Containers/Vector.hpp +++ b/src/TNL/Containers/Vector.hpp @@ -175,7 +175,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -190,7 +190,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( 
end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Containers/VectorView.h b/src/TNL/Containers/VectorView.h index 1662afb7c815892faf2b336d065a2fdf861dfb70..0d03954549d49773e0c68c685d91fd048b1a7b5a 100644 --- a/src/TNL/Containers/VectorView.h +++ b/src/TNL/Containers/VectorView.h @@ -14,7 +14,7 @@ #include #include -#include +#include namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/VectorView.hpp b/src/TNL/Containers/VectorView.hpp index 986d2bc0fbde88de1d57670f64a6b6e08a176f10..9cdde8ef2a49422ee858ff87bb5634a8db075460 100644 --- a/src/TNL/Containers/VectorView.hpp +++ b/src/TNL/Containers/VectorView.hpp @@ -11,8 +11,6 @@ #pragma once #include -#include -#include #include #include @@ -127,7 +125,7 @@ prefixSum( IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template prefixSum< Type >( *this, begin, end ); + Algorithms::PrefixSum< DeviceType, Type >::perform( *this, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, @@ -141,7 +139,7 @@ segmentedPrefixSum( FlagsArray& flags, IndexType begin, IndexType end ) { if( end == 0 ) end = this->getSize(); - Algorithms::VectorOperations< Device >::template segmentedPrefixSum< Type >( *this, flags, begin, end ); + Algorithms::SegmentedPrefixSum< DeviceType, Type >::perform( *this, flags, begin, end, std::plus<>{}, (RealType) 0.0 ); } template< typename Real, diff --git a/src/TNL/Devices/Host.h b/src/TNL/Devices/Host.h index 649749f588143ab80793e64c3d93e03bf2147229..40f55711a817e684f379e442ab30cdec485be013 100644 --- a/src/TNL/Devices/Host.h +++ b/src/TNL/Devices/Host.h @@ -113,17 +113,19 @@ public: } protected: - static bool& ompEnabled() { + static bool& 
ompEnabled() + { #ifdef HAVE_OPENMP - static bool ompEnabled( true ); + static bool ompEnabled = true; #else - static bool ompEnabled( false ); + static bool ompEnabled = false; #endif return ompEnabled; } - static int& maxThreadsCount() { - static int maxThreadsCount( -1 ); + static int& maxThreadsCount() + { + static int maxThreadsCount = -1; return maxThreadsCount; } }; diff --git a/src/TNL/Solvers/Linear/GMRES.h b/src/TNL/Solvers/Linear/GMRES.h index 1cc6901dc5fd142e53b4f4a1e3f458716bba0602..dd72e2832af81b65b9fdcc8d19090d669232bc62 100644 --- a/src/TNL/Solvers/Linear/GMRES.h +++ b/src/TNL/Solvers/Linear/GMRES.h @@ -84,9 +84,16 @@ protected: void hauseholder_cwy( VectorViewType v, const int i ); +// nvcc allows __cuda_callable__ lambdas only in public methods +#ifdef __NVCC__ +public: +#endif void hauseholder_cwy_transposed( VectorViewType z, const int i, ConstVectorViewType w ); +#ifdef __NVCC__ +protected: +#endif template< typename Vector > void update( const int k, diff --git a/src/TNL/Solvers/Linear/GMRES_impl.h b/src/TNL/Solvers/Linear/GMRES_impl.h index f45741151b03c8fadcd695c09d94abd168c12c7f..7f316d730b1bdad4f5439defd093d6140670c58e 100644 --- a/src/TNL/Solvers/Linear/GMRES_impl.h +++ b/src/TNL/Solvers/Linear/GMRES_impl.h @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -427,14 +426,25 @@ hauseholder_generate( const int i, if( i > 0 ) { // aux = Y_{i-1}^T * y_i RealType aux[ i ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( y_i ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _y_i = Traits::getConstLocalView( y_i ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _y_i, ldSize] 
__cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _y_i[ idx ]; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i, + ( (RealType) 0, + fetch, + std::plus<>{}, size, - Y.getData(), - ldSize, - Traits::getLocalView( y_i ).getData(), + i, aux ); // no-op if the problem is not distributed CommunicatorType::Allreduce( aux, i, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); @@ -525,14 +535,25 @@ hauseholder_cwy_transposed( VectorViewType z, { // aux = Y_i^T * w RealType aux[ i + 1 ]; - Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::ParallelReductionScalarProduct< RealType, RealType > scalarProduct; +// Containers::Algorithms::Multireduction< DeviceType >::reduce +// ( scalarProduct, +// i + 1, +// size, +// Y.getData(), +// ldSize, +// Traits::getConstLocalView( w ).getData(), +// aux ); + const RealType* _Y = Y.getData(); + const RealType* _w = Traits::getConstLocalView( w ).getData(); + const IndexType ldSize = this->ldSize; + auto fetch = [_Y, _w, ldSize] __cuda_callable__ ( IndexType idx, int k ) { return _Y[ idx + k * ldSize ] * _w[ idx ]; }; Containers::Algorithms::Multireduction< DeviceType >::reduce - ( scalarProduct, - i + 1, + ( (RealType) 0, + fetch, + std::plus<>{}, size, - Y.getData(), - ldSize, - Traits::getConstLocalView( w ).getData(), + i + 1, aux ); // no-op if the problem is not distributed Traits::CommunicatorType::Allreduce( aux, i + 1, MPI_SUM, Traits::getCommunicationGroup( *this->matrix ) ); diff --git a/src/TNL/Solvers/ODE/Euler.hpp b/src/TNL/Solvers/ODE/Euler.hpp index ccc98daa7ec9eacc4409e6de0688c62e89277cdc..12da6439bd15d4fdbe1e0a088910c940cfc90aa2 100644 --- a/src/TNL/Solvers/ODE/Euler.hpp +++ b/src/TNL/Solvers/ODE/Euler.hpp @@ -119,9 +119,7 @@ bool Euler< Problem, SolverMonitor > :: solve( DofVectorPointer& _u ) continue; } } - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& 
b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; - this->setResidue( addAndReduceAbs( u, currentTau * k1, reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + this->setResidue( addAndReduceAbs( u, currentTau * k1, std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); /**** * When time is close to stopTime the new residue diff --git a/src/TNL/Solvers/ODE/Merson_impl.h b/src/TNL/Solvers/ODE/Merson_impl.h index ce21298b0c283e0258421b43325c32191a625816..3c88576e9e583c4c161782f630630573bd23be3a 100644 --- a/src/TNL/Solvers/ODE/Merson_impl.h +++ b/src/TNL/Solvers/ODE/Merson_impl.h @@ -173,16 +173,13 @@ bool Merson< Problem, SolverMonitor >::solve( DofVectorPointer& _u ) RealType lastResidue = this->getResidue(); time += currentTau; - auto reduction = [] __cuda_callable__ ( RealType& a , const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a , const volatile RealType& b ) { a += b; }; this->setResidue( addAndReduceAbs( u, currentTau / 6.0 * ( k1 + 4.0 * k4 + k5 ), - reduction, volatileReduction, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); + std::plus<>{}, ( RealType ) 0.0 ) / ( currentTau * ( RealType ) u.getSize() ) ); ///// // When time is close to stopTime the new residue // may be inaccurate significantly. if( abs( time - this->stopTime ) < 1.0e-7 ) this->setResidue( lastResidue ); - if( ! 
this->nextIteration() ) return false; diff --git a/src/UnitTests/Containers/DistributedVectorTest.h b/src/UnitTests/Containers/DistributedVectorTest.h index e2d07a02aaf2958cc4c2b410c8670a2ac8294650..430a42d5ecb8fee50a5986e20051311f17a30649 100644 --- a/src/UnitTests/Containers/DistributedVectorTest.h +++ b/src/UnitTests/Containers/DistributedVectorTest.h @@ -43,32 +43,26 @@ protected: using VectorViewType = typename DistributedVectorType::LocalViewType; using DistributedVectorView = Containers::DistributedVectorView< RealType, DeviceType, IndexType, CommunicatorType >; - const int globalSize = 97; // prime number to force non-uniform distribution - const typename CommunicatorType::CommunicationGroup group = CommunicatorType::AllGroup; - DistributedVectorType x, y, z; - - DistributedVectorView x_view, y_view, z_view; + DistributedVectorType v; + DistributedVectorView v_view; + typename DistributedVectorType::HostType v_host; const int rank = CommunicatorType::GetRank(group); const int nproc = CommunicatorType::GetSize(group); + // should be small enough to have fast tests, but large enough to test + // prefix-sum with multiple CUDA grids + const int globalSize = 10000 * nproc; + DistributedVectorTest() { using LocalRangeType = typename DistributedVector::LocalRangeType; const LocalRangeType localRange = Partitioner< IndexType, CommunicatorType >::splitRange( globalSize, group ); - x.setDistribution( localRange, globalSize, group ); - y.setDistribution( localRange, globalSize, group ); - z.setDistribution( localRange, globalSize, group ); - - x_view.bind( x ); - y_view.bind( y ); - z_view.bind( z ); - - setConstantSequence( x, 1 ); - setLinearSequence( y ); - setNegativeLinearSequence( z ); + v.setDistribution( localRange, globalSize, group ); + v_view.bind( v ); + setConstantSequence( v, 1 ); } }; @@ -85,7 +79,247 @@ using DistributedVectorTypes = ::testing::Types< TYPED_TEST_SUITE( DistributedVectorTest, DistributedVectorTypes ); -// TODO: distributed prefix sum 
+#if 1 +TYPED_TEST( DistributedVectorTest, prefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. 
+ if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ); + + setLinearSequence( v ); + v_host = -1; + v_view.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< 
Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif + +#if 1 +TYPED_TEST( DistributedVectorTest, exclusivePrefixSum ) +{ + using RealType = typename TestFixture::DistributedVectorType::RealType; + using DeviceType = typename TestFixture::DistributedVectorType::DeviceType; + using IndexType = typename TestFixture::DistributedVectorType::IndexType; + + auto& v = this->v; + auto& v_view = this->v_view; + auto& v_host = this->v_host; + const auto localRange = v.getLocalRange(); + + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< 
Algorithms::PrefixSumType::Exclusive >(); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + //// + // With CUDA, perform tests with multiple CUDA grids. + if( std::is_same< DeviceType, Devices::Cuda >::value ) + { +#ifdef HAVE_CUDA + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + // test views + setConstantSequence( v, 0 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( 
Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], 0 ); + + setConstantSequence( v, 1 ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], i ); + + setLinearSequence( v ); + v_host = -1; + v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = localRange.getBegin(); i < localRange.getEnd(); i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); +#endif + } +} +#endif #endif // HAVE_GTEST diff --git a/src/UnitTests/Containers/MultireductionTest.h b/src/UnitTests/Containers/MultireductionTest.h index 0487e916b8080bec0ead292ddb95725d8f84533a..7a321f5836cb2e0b2737c6176eb0b23794c4a501 100644 --- a/src/UnitTests/Containers/MultireductionTest.h +++ b/src/UnitTests/Containers/MultireductionTest.h @@ -94,29 +94,45 @@ using VectorTypes = ::testing::Types< TYPED_TEST_SUITE( MultireductionTest, VectorTypes ); -TYPED_TEST( MultireductionTest, scalarProduct ) +// idiot nvcc does not allow __cuda_callable__ lambdas inside private or protected regions +template< typename DeviceVector, typename HostVector > +void test_multireduction( const DeviceVector& V, const DeviceVector& y, HostVector& result ) { - using RealType = typename TestFixture::DeviceVector::RealType; - using DeviceType = typename 
TestFixture::DeviceVector::DeviceType; + using RealType = typename DeviceVector::RealType; + using DeviceType = typename DeviceVector::DeviceType; + using IndexType = typename DeviceVector::IndexType; + + const RealType* _V = V.getData(); + const RealType* _y = y.getData(); + const IndexType size = y.getSize(); + const int n = result.getSize(); + ASSERT_EQ( V.getSize(), size * n ); - ParallelReductionScalarProduct< RealType, RealType > scalarProduct; + auto fetch = [=] __cuda_callable__ ( IndexType i, int k ) + { + TNL_ASSERT_LT( i, size, "BUG: fetcher got invalid index i" ); + TNL_ASSERT_LT( k, n, "BUG: fetcher got invalid index k" ); + return _V[ i + k * size ] * _y[ i ]; + }; Multireduction< DeviceType >::reduce - ( scalarProduct, - this->n, - this->size, - this->V.getData(), - this->size, - this->y.getData(), - this->result.getData() ); - - for( int i = 0; i < this->n; i++ ) { + ( (RealType) 0, + fetch, + std::plus<>{}, + size, + n, + result.getData() ); + + for( int i = 0; i < n; i++ ) { if( i % 2 == 0 ) - EXPECT_EQ( this->result[ i ], 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], 0.5 * size * ( size - 1 ) ); else - EXPECT_EQ( this->result[ i ], - 0.5 * this->size * ( this->size - 1 ) ); + EXPECT_EQ( result[ i ], - 0.5 * size * ( size - 1 ) ); } } - +TYPED_TEST( MultireductionTest, scalarProduct ) +{ + test_multireduction( this->V, this->y, this->result ); +} #endif // HAVE_GTEST diff --git a/src/UnitTests/Containers/VectorBinaryOperationsTest.h b/src/UnitTests/Containers/VectorBinaryOperationsTest.h index 8fae768849c26406ecabc08c9fcf261e686d9453..93283483c8a06e89c6e9dfceee81f82e391c5955 100644 --- a/src/UnitTests/Containers/VectorBinaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorBinaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace binary_tests { diff --git a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h 
b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h index d8c26b93c7a9da3a67fcb499f342dafa7f2d269d..8c0cd7f90bf20821043c46f4c6d7a9c74fbb511c 100644 --- a/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h +++ b/src/UnitTests/Containers/VectorEvaluateAndReduceTest.h @@ -22,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // should be small enough to have fast tests, but larger than minGPUReductionDataSize @@ -37,9 +36,7 @@ performEvaluateAndReduce( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return evaluateAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return evaluateAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } TYPED_TEST( VectorTest, evaluateAndReduce ) @@ -74,9 +71,7 @@ performAddAndReduce1( VectorView& u, VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, u * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( w, u * v, std::plus<>{}, ( RealType ) 0.0 ); } template< typename VectorView > @@ -85,9 +80,7 @@ performAddAndReduce2( VectorView& v, VectorView& w ) { using RealType = typename VectorView::RealType; - auto reduction = [] __cuda_callable__ ( RealType& a, const RealType& b ) { a += b; }; - auto volatileReduction = [] __cuda_callable__ ( volatile RealType& a, volatile RealType& b ) { a += b; }; - return addAndReduce( w, 5.0 * v, reduction, volatileReduction, ( RealType ) 0.0 ); + return addAndReduce( 
w, 5.0 * v, std::plus<>{}, ( RealType ) 0.0 ); } diff --git a/src/UnitTests/Containers/VectorPrefixSumTest.h b/src/UnitTests/Containers/VectorPrefixSumTest.h index 733013933014f016f5c0faf18dcc8bdea12bc16f..4659d365d87936e595c7462a7e95096410086e6c 100644 --- a/src/UnitTests/Containers/VectorPrefixSumTest.h +++ b/src/UnitTests/Containers/VectorPrefixSumTest.h @@ -20,102 +20,120 @@ constexpr int VECTOR_TEST_SIZE = 10000; TYPED_TEST( VectorTest, prefixSum ) { using VectorType = typename TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) - return; + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) + return; VectorType v( size ); ViewType v_view( v ); typename VectorType::HostType v_host( size ); - v = 0; + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 
); v_host = -1; v_view.prefixSum(); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::setMaxGridSize( 3 ); - v = 0; + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::maxGridSize() = 3; + + setConstantSequence( v, 0 ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; + + setConstantSequence( v, 1 ); + v_host = -1; + v.prefixSum(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v_view; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); + for( int i = 
0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i + 1 ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v = 0; + setConstantSequence( v, 1 ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v_view; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i + 1 ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.prefixSum(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i + 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Inclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } @@ -123,15 +141,14 @@ TYPED_TEST( VectorTest, prefixSum ) TYPED_TEST( VectorTest, exclusivePrefixSum ) { using VectorType = typename 
TestFixture::VectorType; - using VectorOperations = typename TestFixture::VectorOperations; using ViewType = typename TestFixture::ViewType; using RealType = typename VectorType::RealType; using DeviceType = typename VectorType::DeviceType; using IndexType = typename VectorType::IndexType; const int size = VECTOR_TEST_SIZE; - if( std::is_same< RealType, float >::value || - std::is_same< IndexType, short >::value ) + // FIXME: tests should work in all cases + if( std::is_same< RealType, float >::value ) return; VectorType v; @@ -139,103 +156,106 @@ TYPED_TEST( VectorTest, exclusivePrefixSum ) ViewType v_view( v ); typename VectorType::HostType v_host( size ); - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 
); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; //// // With CUDA, perform tests with multiple CUDA grids. if( std::is_same< DeviceType, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::setMaxGridSize( 3 ); + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::maxGridSize() = 3; - setConstantSequence( v, 1 ); + setConstantSequence( v, 0 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); + 
EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); v_host = v; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; - setConstantSequence( v, 1 ); + // test views + setConstantSequence( v, 0 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], i ); + EXPECT_EQ( v_host[ i ], 0 ) << "i = " << i; - v.setValue( 0 ); + setConstantSequence( v, 1 ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount() ), 1 ); + v_host = v; for( int i = 0; i < size; i++ ) - EXPECT_EQ( v_host[ i ], 0 ); + EXPECT_EQ( v_host[ i ], i ) << "i = " << i; setLinearSequence( v ); v_host = -1; v_view.template prefixSum< Algorithms::PrefixSumType::Exclusive >(); - EXPECT_GT( ( CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::gridsCount ), 1 ); - v_host = v_view; - for( int i = 1; i < size; i++ ) - EXPECT_EQ( v_host[ i ] - v_host[ i - 1 ], i - 1 ); - CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); + EXPECT_GT( ( Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType 
>::gridsCount() ), 1 ); + v_host = v; + for( int i = 0; i < size; i++ ) + EXPECT_EQ( v_host[ i ], (i * (i - 1)) / 2 ) << "i = " << i; + + Algorithms::CudaPrefixSumKernelLauncher< Algorithms::PrefixSumType::Exclusive, RealType, IndexType >::resetMaxGridSize(); #endif } } diff --git a/src/UnitTests/Containers/VectorTestSetup.h b/src/UnitTests/Containers/VectorTestSetup.h index 7338f9637d3178a4d9b567af59e6c201c419f30e..5c342dced87f713824344cb43ee1c9922dbf0ef6 100644 --- a/src/UnitTests/Containers/VectorTestSetup.h +++ b/src/UnitTests/Containers/VectorTestSetup.h @@ -8,8 +8,6 @@ /* See Copyright Notice in tnl/Copyright */ -// NOTE: Vector = Array + VectorOperations, so we test Vector and VectorOperations at the same time - #pragma once #ifdef HAVE_GTEST @@ -24,7 +22,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; using namespace TNL::Arithmetics; // test fixture for typed tests @@ -33,7 +30,6 @@ class VectorTest : public ::testing::Test { protected: using VectorType = Vector; - using VectorOperations = Algorithms::VectorOperations< typename VectorType::DeviceType >; using ViewType = VectorView< typename Vector::RealType, typename Vector::DeviceType, typename Vector::IndexType >; }; diff --git a/src/UnitTests/Containers/VectorUnaryOperationsTest.h b/src/UnitTests/Containers/VectorUnaryOperationsTest.h index a0aa2695f054295ec83ced0e7b9fa2af70249872..1224042532b2c55eb33757e61fe603219840aa61 100644 --- a/src/UnitTests/Containers/VectorUnaryOperationsTest.h +++ b/src/UnitTests/Containers/VectorUnaryOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace unary_tests { diff --git a/src/UnitTests/Containers/VectorVerticalOperationsTest.h b/src/UnitTests/Containers/VectorVerticalOperationsTest.h index 758ee1d50026dd0c23ccbf51f37c268732e5e16b..93ff286ae65fb5b9dc170903d6aeb16032ebaa69 100644 --- 
a/src/UnitTests/Containers/VectorVerticalOperationsTest.h +++ b/src/UnitTests/Containers/VectorVerticalOperationsTest.h @@ -31,7 +31,6 @@ using namespace TNL; using namespace TNL::Containers; -using namespace TNL::Containers::Algorithms; namespace vertical_tests {