Loading CMakeLists.txt +0 −4 Original line number Diff line number Diff line Loading @@ -409,10 +409,6 @@ endif() # endif() #endif() if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" ) AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " ) endif() CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" ) INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} ) Loading build +0 −3 Original line number Diff line number Diff line Loading @@ -34,7 +34,6 @@ INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" OPTIMIZED_VECTOR_HOST_OPERATIONS="no" for option in "$@" do Loading Loading @@ -75,7 +74,6 @@ do INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;; * ) echo "Unknown option ${option}. Use --help for more information." exit 1 ;; Loading Loading @@ -175,7 +173,6 @@ cmake_command=( -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} -DINSTANTIATE_INT=${INSTANTIATE_INT} -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS} ) # Skip running cmake if it was already run and the cmake command is the same. Loading src/Benchmarks/BLAS/vector-operations.h +0 −109 Original line number Diff line number Diff line Loading @@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto maxHost = [&]() { resultHost = hostVector.max(); }; auto maxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost ); benchmark.time( reset1, "CPU (general)", maxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", maxCuda ); #endif Loading @@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto minHost = [&]() { resultHost = hostVector.min(); }; auto minHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost ); benchmark.time( reset1, "CPU (general)", minHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", minCuda ); #endif Loading @@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMaxHost = [&]() { resultHost = hostVector.absMax(); }; auto absMaxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; Loading @@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost ); benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMaxCuda ); benchmark.time( reset1, "cuBLAS", absMaxCublas ); Loading @@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMinHost = [&]() { resultHost = hostVector.absMin(); }; auto absMinHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; Loading @@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost ); benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMinCuda ); benchmark.time( reset1, "cuBLAS", absMinCublas ); Loading @@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto sumHost = [&]() { resultHost = hostVector.sum(); }; auto sumHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost ); benchmark.time( reset1, "CPU (general)", sumHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", sumCuda ); #endif Loading @@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l1normHost = [&]() { resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; Loading @@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost ); benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l1normCuda ); benchmark.time( reset1, "cuBLAS", l1normCublas ); Loading @@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionL2Norm< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; Loading @@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost ); benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l2normCuda ); benchmark.time( reset1, "cuBLAS", l2normCublas ); Loading @@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normHost = [&]() { resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionLpNorm< Real > operation; operation.setPower( 3.0 ); Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost ); benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l3normCuda ); #endif Loading @@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), hostVector2.getData(), result ); return result; }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; Loading @@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost ); benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", scalarProductCuda ); benchmark.time( reset1, "cuBLAS", scalarProductCublas ); Loading src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h +3 −10 Original line number Diff line number Diff line Loading @@ -183,11 +183,8 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); //TODO: The parallel reduction on the CUDA device with different element types is needed. bool result = false; Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); } template< typename Element, Loading @@ -201,11 +198,9 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; reductionContainsValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); } template< typename Element, Loading @@ -219,11 +214,9 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); } Loading src/TNL/Containers/Algorithms/Reduction.h +6 −9 Original line number Diff line number Diff line Loading @@ -30,12 +30,11 @@ class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -43,12 +42,11 @@ class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -56,12 +54,11 @@ class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; } // namespace Algorithms Loading Loading
CMakeLists.txt +0 −4 Original line number Diff line number Diff line Loading @@ -409,10 +409,6 @@ endif() # endif() #endif() if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" ) AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " ) endif() CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" ) INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} ) Loading
build +0 −3 Original line number Diff line number Diff line Loading @@ -34,7 +34,6 @@ INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" OPTIMIZED_VECTOR_HOST_OPERATIONS="no" for option in "$@" do Loading Loading @@ -75,7 +74,6 @@ do INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;; * ) echo "Unknown option ${option}. Use --help for more information." exit 1 ;; Loading Loading @@ -175,7 +173,6 @@ cmake_command=( -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} -DINSTANTIATE_INT=${INSTANTIATE_INT} -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS} ) # Skip running cmake if it was already run and the cmake command is the same. Loading
src/Benchmarks/BLAS/vector-operations.h +0 −109 Original line number Diff line number Diff line Loading @@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto maxHost = [&]() { resultHost = hostVector.max(); }; auto maxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost ); benchmark.time( reset1, "CPU (general)", maxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", maxCuda ); #endif Loading @@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto minHost = [&]() { resultHost = hostVector.min(); }; auto minHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost ); benchmark.time( reset1, "CPU (general)", minHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", minCuda ); #endif Loading @@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMaxHost = [&]() { resultHost = hostVector.absMax(); }; auto absMaxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; Loading @@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost ); benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMaxCuda ); benchmark.time( reset1, "cuBLAS", absMaxCublas ); Loading @@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMinHost = [&]() { resultHost = hostVector.absMin(); }; auto absMinHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; Loading @@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost ); benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMinCuda ); benchmark.time( reset1, "cuBLAS", absMinCublas ); Loading @@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto sumHost = [&]() { resultHost = hostVector.sum(); }; auto sumHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost ); benchmark.time( reset1, "CPU (general)", sumHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", sumCuda ); #endif Loading @@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l1normHost = [&]() { resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; Loading @@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost ); benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l1normCuda ); benchmark.time( reset1, "cuBLAS", l1normCublas ); Loading @@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionL2Norm< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; Loading @@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost ); benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l2normCuda ); benchmark.time( reset1, "cuBLAS", l2normCublas ); Loading @@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normHost = [&]() { resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionLpNorm< Real > operation; operation.setPower( 3.0 ); Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost ); benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l3normCuda ); #endif Loading @@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), hostVector2.getData(), result ); return result; }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; Loading @@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost ); benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", scalarProductCuda ); benchmark.time( reset1, "cuBLAS", scalarProductCublas ); Loading
src/TNL/Containers/Algorithms/ArrayOperationsCuda_impl.h +3 −10 Original line number Diff line number Diff line Loading @@ -183,11 +183,8 @@ compareMemory( const Element1* destination, { TNL_ASSERT_TRUE( destination, "Attempted to compare data through a nullptr." ); TNL_ASSERT_TRUE( source, "Attempted to compare data through a nullptr." ); //TODO: The parallel reduction on the CUDA device with different element types is needed. bool result = false; Algorithms::ParallelReductionEqualities< Element1, Element2 > reductionEqualities; Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionEqualities, size, destination, source ); } template< typename Element, Loading @@ -201,11 +198,9 @@ containsValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsValue< Element > reductionContainsValue; reductionContainsValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsValue, size, data, nullptr ); } template< typename Element, Loading @@ -219,11 +214,9 @@ containsOnlyValue( const Element* data, TNL_ASSERT_TRUE( data, "Attempted to check data through a nullptr." ); TNL_ASSERT_GE( size, 0, "" ); if( size == 0 ) return false; bool result = false; Algorithms::ParallelReductionContainsOnlyValue< Element > reductionContainsOnlyValue; reductionContainsOnlyValue.setValue( value ); Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, 0, result ); return result; return Reduction< Devices::Cuda >::reduce( reductionContainsOnlyValue, size, data, nullptr ); } Loading
src/TNL/Containers/Algorithms/Reduction.h +6 −9 Original line number Diff line number Diff line Loading @@ -30,12 +30,11 @@ class Reduction< Devices::Cuda > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -43,12 +42,11 @@ class Reduction< Devices::Host > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; template<> Loading @@ -56,12 +54,11 @@ class Reduction< Devices::MIC > { public: template< typename Operation, typename Index > static void static typename Operation::ResultType reduce( Operation& operation, const Index size, const typename Operation::DataType1* deviceInput1, const typename Operation::DataType2* deviceInput2, typename Operation::ResultType& result ); const typename Operation::DataType2* deviceInput2 ); }; } // namespace Algorithms Loading