Loading CMakeLists.txt +0 −4 Original line number Diff line number Diff line Loading @@ -409,10 +409,6 @@ endif() # endif() #endif() if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" ) AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " ) endif() CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" ) INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} ) Loading build +0 −3 Original line number Diff line number Diff line Loading @@ -34,7 +34,6 @@ INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" OPTIMIZED_VECTOR_HOST_OPERATIONS="no" for option in "$@" do Loading Loading @@ -75,7 +74,6 @@ do INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;; * ) echo "Unknown option ${option}. Use --help for more information." exit 1 ;; Loading Loading @@ -175,7 +173,6 @@ cmake_command=( -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} -DINSTANTIATE_INT=${INSTANTIATE_INT} -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS} ) # Skip running cmake if it was already run and the cmake command is the same. Loading src/Benchmarks/BLAS/vector-operations.h +0 −109 Original line number Diff line number Diff line Loading @@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto maxHost = [&]() { resultHost = hostVector.max(); }; auto maxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost ); benchmark.time( reset1, "CPU (general)", maxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", maxCuda ); #endif Loading @@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto minHost = [&]() { resultHost = hostVector.min(); }; auto minHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost ); benchmark.time( reset1, "CPU (general)", minHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", minCuda ); #endif Loading @@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMaxHost = [&]() { resultHost = hostVector.absMax(); }; auto absMaxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; Loading @@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost ); benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMaxCuda ); benchmark.time( reset1, "cuBLAS", absMaxCublas ); Loading @@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMinHost = [&]() { resultHost = hostVector.absMin(); }; auto absMinHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; Loading @@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost ); benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMinCuda ); benchmark.time( reset1, "cuBLAS", absMinCublas ); Loading @@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto sumHost = [&]() { resultHost = hostVector.sum(); }; auto sumHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost ); benchmark.time( reset1, "CPU (general)", sumHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", sumCuda ); #endif Loading @@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l1normHost = [&]() { resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; Loading @@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost ); benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l1normCuda ); benchmark.time( reset1, "cuBLAS", l1normCublas ); Loading @@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionL2Norm< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; Loading @@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost ); benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l2normCuda ); benchmark.time( reset1, "cuBLAS", l2normCublas ); Loading @@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normHost = [&]() { resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionLpNorm< Real > operation; operation.setPower( 3.0 ); Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost ); benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l3normCuda ); #endif Loading @@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), hostVector2.getData(), result ); return result; }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; Loading @@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost ); benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", scalarProductCuda ); benchmark.time( reset1, "cuBLAS", scalarProductCublas ); Loading src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time, cell.getBasis(), gridXIdx, gridYIdx ); cudaThreadSynchronize(); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; //std::cerr << "Computing the heat equation ..." << std::endl; Loading @@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time, cell.getBasis(), gridXIdx, gridYIdx ); cudaThreadSynchronize(); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } if( this->cudaKernelType == "templated" ) Loading src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h +1 −1 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ int main( int argc, char* argv[] ) while( iteration < 10000 ) { testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>(); cudaThreadSynchronize(); cudaDeviceSynchronize(); iteration++; } auto t_stop = std::chrono::high_resolution_clock::now(); Loading Loading
CMakeLists.txt +0 −4 Original line number Diff line number Diff line Loading @@ -409,10 +409,6 @@ endif() # endif() #endif() if( OPTIMIZED_VECTOR_HOST_OPERATIONS STREQUAL "yes" ) AddCompilerFlag( "-DOPTIMIZED_VECTOR_HOST_OPERATIONS " ) endif() CONFIGURE_FILE( "tnlConfig.h.in" "${PROJECT_BUILD_PATH}/TNL/tnlConfig.h" ) INSTALL( FILES ${PROJECT_BUILD_PATH}/TNL/tnlConfig.h DESTINATION ${TNL_TARGET_INCLUDE_DIRECTORY} ) Loading
build +0 −3 Original line number Diff line number Diff line Loading @@ -34,7 +34,6 @@ INSTANTIATE_INT="yes" INSTANTIATE_LONG_DOUBLE="no" INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" OPTIMIZED_VECTOR_HOST_OPERATIONS="no" for option in "$@" do Loading Loading @@ -75,7 +74,6 @@ do INSTANTIATE_DOUBLE="yes" INSTANTIATE_FLOAT="no" WITH_CUDA_ARCH="auto" ;; --optimize-vector-host-operations=* ) OPTIMIZED_VECTOR_HOST_OPERATIONS="yes" ;; * ) echo "Unknown option ${option}. Use --help for more information." exit 1 ;; Loading Loading @@ -175,7 +173,6 @@ cmake_command=( -DINSTANTIATE_LONG_DOUBLE=${INSTANTIATE_LONG_DOUBLE} -DINSTANTIATE_INT=${INSTANTIATE_INT} -DINSTANTIATE_LONG_INT=${INSTANTIATE_LONG_INT} -DOPTIMIZED_VECTOR_HOST_OPERATIONS=${OPTIMIZED_VECTOR_HOST_OPERATIONS} ) # Skip running cmake if it was already run and the cmake command is the same. Loading
src/Benchmarks/BLAS/vector-operations.h +0 −109 Original line number Diff line number Diff line Loading @@ -87,23 +87,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto maxHost = [&]() { resultHost = hostVector.max(); }; auto maxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); benchmark.time( reset1, "CPU", maxHost ); benchmark.time( reset1, "CPU (general)", maxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", maxCuda ); #endif Loading @@ -112,23 +100,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto minHost = [&]() { resultHost = hostVector.min(); }; auto minHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); benchmark.time( reset1, "CPU", minHost ); benchmark.time( reset1, "CPU (general)", minHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", minCuda ); #endif Loading @@ -137,17 +113,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMaxHost = [&]() { resultHost = hostVector.absMax(); }; auto absMaxHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMax< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; Loading @@ -162,7 +127,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMax", datasetSize ); benchmark.time( reset1, "CPU", absMaxHost ); benchmark.time( reset1, "CPU (general)", absMaxHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMaxCuda ); benchmark.time( reset1, "cuBLAS", absMaxCublas ); Loading @@ -172,17 +136,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto absMinHost = [&]() { resultHost = hostVector.absMin(); }; auto absMinHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsMin< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; Loading @@ -197,7 +150,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "absMin", datasetSize ); benchmark.time( reset1, "CPU", absMinHost ); benchmark.time( reset1, "CPU (general)", absMinHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", absMinCuda ); benchmark.time( reset1, "cuBLAS", absMinCublas ); Loading @@ -207,23 +159,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto sumHost = [&]() { resultHost = hostVector.sum(); }; auto sumHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); benchmark.time( reset1, "CPU", sumHost ); benchmark.time( reset1, "CPU (general)", sumHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", sumCuda ); #endif Loading @@ -232,17 +172,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l1normHost = [&]() { resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionAbsSum< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; Loading @@ -255,7 +184,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l1 norm", datasetSize ); benchmark.time( reset1, "CPU", l1normHost ); benchmark.time( reset1, "CPU (general)", l1normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l1normCuda ); benchmark.time( reset1, "cuBLAS", l1normCublas ); Loading @@ -265,17 +193,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionL2Norm< Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; Loading @@ -288,7 +205,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "l2 norm", datasetSize ); benchmark.time( reset1, "CPU", l2normHost ); benchmark.time( reset1, "CPU (general)", l2normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l2normCuda ); benchmark.time( reset1, "cuBLAS", l2normCublas ); Loading @@ -298,24 +214,11 @@ benchmarkVectorOperations( Benchmark & benchmark, auto l3normHost = [&]() { resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionLpNorm< Real > operation; operation.setPower( 3.0 ); Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), ( Real* ) 0, result ); return result; }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); benchmark.time( reset1, "CPU", l3normHost ); benchmark.time( reset1, "CPU (general)", l3normHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", l3normCuda ); #endif Loading @@ -324,17 +227,6 @@ benchmarkVectorOperations( Benchmark & benchmark, auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductHostGeneral = [&]() { Real result( 0 ); Containers::Algorithms::ParallelReductionScalarProduct< Real, Real > operation; Containers::Algorithms::Reduction< Devices::Host >::reduce( operation, hostVector.getSize(), hostVector.getData(), hostVector2.getData(), result ); return result; }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; Loading @@ -348,7 +240,6 @@ benchmarkVectorOperations( Benchmark & benchmark, #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); benchmark.time( reset1, "CPU", scalarProductHost ); benchmark.time( reset1, "CPU (general)", scalarProductHostGeneral ); #ifdef HAVE_CUDA benchmark.time( reset1, "GPU", scalarProductCuda ); benchmark.time( reset1, "cuBLAS", scalarProductCublas ); Loading
src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h +2 −2 Original line number Diff line number Diff line Loading @@ -514,7 +514,7 @@ getExplicitUpdate( const RealType& time, cell.getBasis(), gridXIdx, gridYIdx ); cudaThreadSynchronize(); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; //std::cerr << "Computing the heat equation ..." << std::endl; Loading @@ -534,7 +534,7 @@ getExplicitUpdate( const RealType& time, cell.getBasis(), gridXIdx, gridYIdx ); cudaThreadSynchronize(); cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; } if( this->cudaKernelType == "templated" ) Loading
src/Benchmarks/HeatEquation/tnl-benchmark-simple-heat-equation-bug.h +1 −1 Original line number Diff line number Diff line Loading @@ -47,7 +47,7 @@ int main( int argc, char* argv[] ) while( iteration < 10000 ) { testKernel< GridEntity ><<< cudaGridSize, cudaBlockSize >>>(); cudaThreadSynchronize(); cudaDeviceSynchronize(); iteration++; } auto t_stop = std::chrono::high_resolution_clock::now(); Loading