Loading src/Benchmarks/BLAS/vector-operations.h +45 −72 Original line number Diff line number Diff line Loading @@ -88,15 +88,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.max(); }; auto maxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); Loading @@ -113,15 +110,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.min(); }; auto minHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto minCuda = [&]() { resultDevice = deviceVector.min(); Loading @@ -138,15 +132,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.absMax(); }; auto absMaxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); Loading @@ -173,15 +164,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.absMin(); }; auto absMinHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); Loading @@ -208,15 +196,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.sum(); }; auto sumHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); Loading @@ -233,15 +218,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); Loading @@ -266,15 +248,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionL2Norm< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); Loading @@ -299,16 +278,13 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionLpNorm< Real > operation; operation.setPower( 3.0 ); Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); Loading @@ -325,15 +301,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), hostVector2.getData(), result ); return result; hostVector2.getData() ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); Loading src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h +3 −10 Original line number Diff line number Diff line Loading @@ -183,11 +183,8 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); //TODO: The parallel reduction on the CUDA device with different element types is needed. bool result = false; Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); } template< typename Element, Loading @@ -201,11 +198,9 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; reductionContainsValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); } template< typename Element, Loading @@ -219,11 +214,9 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); } Loading src/TNL/Containers/Algorithms/Reduction.h +6 −9 Original line number Diff line number Diff line Loading @@ -30,12 +30,11 @@ class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -43,12 +42,11 @@ class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -56,12 +54,11 @@ class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; } // namespace Algorithms Loading src/TNL/Containers/Algorithms/Reduction_impl.h +42 −45 Original line number Diff line number Diff line Loading @@ -39,13 +39,12 @@ namespace Algorithms { static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; template< typename Operation, typename Index > void typename Operation::ResultType Reduction< Devices::Cuda >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ) const typename Operation::DataType2* deviceInput2 ) { #ifdef HAVE_CUDA Loading Loading @@ -75,12 +74,11 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 ); } else { Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr ); } return; } #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -123,12 +121,14 @@ reduce( Operation& operation, * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr, result ); const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif return result; } else { /*** Loading @@ -152,28 +152,27 @@ reduce( Operation& operation, ResultType resultArray[ 1 ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); result = resultArray[ 0 ]; const ResultType result = resultArray[ 0 ]; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif } TNL_CHECK_CUDA_DEVICE; return result; } #else throw Exceptions::CudaSupportMissing(); #endif }; template< typename Operation, typename Index > void typename Operation::ResultType Reduction< Devices::Host >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* input1, const typename Operation::DataType2* input2, typename Operation::ResultType& result ) const typename Operation::DataType2* input2 ) { typedef Index IndexType; typedef typename Operation::DataType1 DataType1; Loading @@ -182,17 +181,13 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP constexpr int block_size = 128; if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable ResultType result = operation.initialValue(); #pragma omp parallel { const int blocks = size / block_size; // first thread initializes the global result variable #pragma omp single nowait { result = operation.initialValue(); } // initialize thread-local result variable ResultType r = operation.initialValue(); Loading @@ -215,12 +210,14 @@ reduce( Operation& operation, { operation.commonReduction( result, r ); } return result; } else { #endif result = operation.initialValue(); ResultType result = operation.initialValue(); for( IndexType i = 0; i < size; i++ ) operation.firstReduction( result, i, input1, input2 ); return result; #ifdef HAVE_OPENMP } #endif Loading src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +74 −121 File changed.Preview size limit exceeded, changes collapsed. Show changes Loading
src/Benchmarks/BLAS/vector-operations.h +45 −72 Original line number Diff line number Diff line Loading @@ -88,15 +88,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.max(); }; auto maxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); Loading @@ -113,15 +110,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.min(); }; auto minHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto minCuda = [&]() { resultDevice = deviceVector.min(); Loading @@ -138,15 +132,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.absMax(); }; auto absMaxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); Loading @@ -173,15 +164,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.absMin(); }; auto absMinHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); Loading @@ -208,15 +196,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.sum(); }; auto sumHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); Loading @@ -233,15 +218,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); Loading @@ -266,15 +248,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionL2Norm< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); Loading @@ -299,16 +278,13 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionLpNorm< Real > operation; operation.setPower( 3.0 ); Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; ( Real* ) 0 ); }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); Loading @@ -325,15 +301,12 @@ benchmarkVectorOperations( Benchmark & benchmark, resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( return Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), hostVector2.getData(), result ); return result; hostVector2.getData() ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); Loading
src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h +3 −10 Original line number Diff line number Diff line Loading @@ -183,11 +183,8 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); //TODO: The parallel reduction on the CUDA device with different element types is needed. bool result = false; Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); } template< typename Element, Loading @@ -201,11 +198,9 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; reductionContainsValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); } template< typename Element, Loading @@ -219,11 +214,9 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); } Loading
src/TNL/Containers/Algorithms/Reduction.h +6 −9 Original line number Diff line number Diff line Loading @@ -30,12 +30,11 @@ class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -43,12 +42,11 @@ class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -56,12 +54,11 @@ class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; } // namespace Algorithms Loading
src/TNL/Containers/Algorithms/Reduction_impl.h +42 −45 Original line number Diff line number Diff line Loading @@ -39,13 +39,12 @@ namespace Algorithms { static constexpr int Reduction_minGpuDataSize = 256;//65536; //16384;//1024;//256; template< typename Operation, typename Index > void typename Operation::ResultType Reduction< Devices::Cuda >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ) const typename Operation::DataType2* deviceInput2 ) { #ifdef HAVE_CUDA Loading Loading @@ -75,12 +74,11 @@ reduce( Operation& operation, using _DT2 = typename std::conditional< std::is_same< DataType2, void >::value, DataType1, DataType2 >::type; typename std::remove_const< _DT2 >::type hostArray2[ Reduction_minGpuDataSize ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( hostArray2, (_DT2*) deviceInput2, size ); Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2, result ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, hostArray2 ); } else { Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr, result ); return Reduction< Devices::Host >::reduce( operation, size, hostArray1, (DataType2*) nullptr ); } return; } #ifdef CUDA_REDUCTION_PROFILING Loading Loading @@ -123,12 +121,14 @@ reduce( Operation& operation, * Reduce the data on the host system. */ LaterReductionOperation laterReductionOperation; Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr, result ); const ResultType result = Reduction< Devices::Host >::reduce( laterReductionOperation, reducedSize, resultArray, (void*) nullptr ); #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Reduction of small data set on CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif return result; } else { /*** Loading @@ -152,28 +152,27 @@ reduce( Operation& operation, ResultType resultArray[ 1 ]; ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory( resultArray, deviceAux1, reducedSize ); result = resultArray[ 0 ]; const ResultType result = resultArray[ 0 ]; #ifdef CUDA_REDUCTION_PROFILING timer.stop(); std::cout << " Transferring the result to CPU took " << timer.getRealTime() << " sec. " << std::endl; #endif } TNL_CHECK_CUDA_DEVICE; return result; } #else throw Exceptions::CudaSupportMissing(); #endif }; template< typename Operation, typename Index > void typename Operation::ResultType Reduction< Devices::Host >:: reduce( Operation& operation, const Index size, const typename Operation::DataType1* input1, const typename Operation::DataType2* input2, typename Operation::ResultType& result ) const typename Operation::DataType2* input2 ) { typedef Index IndexType; typedef typename Operation::DataType1 DataType1; Loading @@ -182,17 +181,13 @@ reduce( Operation& operation, #ifdef HAVE_OPENMP constexpr int block_size = 128; if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) if( TNL::Devices::Host::isOMPEnabled() && size >= 2 * block_size ) { // global result variable ResultType result = operation.initialValue(); #pragma omp parallel { const int blocks = size / block_size; // first thread initializes the global result variable #pragma omp single nowait { result = operation.initialValue(); } // initialize thread-local result variable ResultType r = operation.initialValue(); Loading @@ -215,12 +210,14 @@ reduce( Operation& operation, { operation.commonReduction( result, r ); } return result; } else { #endif result = operation.initialValue(); ResultType result = operation.initialValue(); for( IndexType i = 0; i < size; i++ ) operation.firstReduction( result, i, input1, input2 ); return result; #ifdef HAVE_OPENMP } #endif Loading
src/TNL/Containers/Algorithms/VectorOperationsCuda_impl.h +74 −121 File changed.Preview size limit exceeded, changes collapsed. Show changes