diff --git a/src/Benchmarks/BLAS/array-operations.h b/src/Benchmarks/BLAS/array-operations.h index 9ee6ff8a0699bbc96c63e34942f9ee890d88685f..b5cf9ff58da754aaa4c8ce645989afbb25e61f7c 100644 --- a/src/Benchmarks/BLAS/array-operations.h +++ b/src/Benchmarks/BLAS/array-operations.h @@ -72,9 +72,9 @@ benchmarkArrayOperations( Benchmark & benchmark, resultDevice = (int) deviceArray == deviceArray2; }; benchmark.setOperation( "comparison (operator==)", 2 * datasetSize ); - benchmark.time( reset1, "CPU", compareHost ); + benchmark.time< Devices::Host >( reset1, "CPU", compareHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", compareCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", compareCuda ); #endif @@ -87,9 +87,9 @@ benchmarkArrayOperations( Benchmark & benchmark, benchmark.setOperation( "copy (operator=)", 2 * datasetSize ); // copyBasetime is used later inside HAVE_CUDA guard, so the compiler will // complain when compiling without CUDA - const double copyBasetime = benchmark.time( reset1, "CPU", copyAssignHostHost ); + const double copyBasetime = benchmark.time< Devices::Host >( reset1, "CPU", copyAssignHostHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", copyAssignCudaCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", copyAssignCudaCuda ); #endif @@ -101,8 +101,8 @@ benchmarkArrayOperations( Benchmark & benchmark, }; #ifdef HAVE_CUDA benchmark.setOperation( "copy (operator=)", datasetSize, copyBasetime ); - benchmark.time( reset1, "CPU->GPU", copyAssignHostCuda ); - benchmark.time( reset1, "GPU->CPU", copyAssignCudaHost ); + benchmark.time< Devices::Cuda >( reset1, "CPU->GPU", copyAssignHostCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU->CPU", copyAssignCudaHost ); #endif @@ -113,9 +113,9 @@ benchmarkArrayOperations( Benchmark & benchmark, deviceArray.setValue( 3.0 ); }; benchmark.setOperation( "setValue", datasetSize ); - benchmark.time( reset1, "CPU", setValueHost ); + benchmark.time< Devices::Host >( reset1, "CPU", setValueHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", setValueCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", setValueCuda ); #endif @@ -132,9 +132,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "allocation (setSize)", datasetSize ); - benchmark.time( resetSize1, "CPU", setSizeHost ); + benchmark.time< Devices::Host >( resetSize1, "CPU", setSizeHost ); #ifdef HAVE_CUDA - benchmark.time( resetSize1, "GPU", setSizeCuda ); + benchmark.time< Devices::Cuda >( resetSize1, "GPU", setSizeCuda ); #endif @@ -151,9 +151,9 @@ benchmarkArrayOperations( Benchmark & benchmark, #endif }; benchmark.setOperation( "deallocation (reset)", datasetSize ); - benchmark.time( setSize1, "CPU", resetSizeHost ); + benchmark.time< Devices::Host >( setSize1, "CPU", resetSizeHost ); #ifdef HAVE_CUDA - benchmark.time( setSize1, "GPU", resetSizeCuda ); + benchmark.time< Devices::Cuda >( setSize1, "GPU", resetSizeCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index 9df40f4ec6b0db6d8d268dd5d38cef92b9fc61c8..7299f828a51f11ed5cab5ecd178e0e78040fcc90 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -163,9 +163,9 @@ benchmarkSpMV( Benchmark & benchmark, }; benchmark.setOperation( datasetSize ); - benchmark.time( reset, "CPU", spmvHost ); + benchmark.time< Devices::Host >( reset, "CPU", spmvHost ); #ifdef HAVE_CUDA - benchmark.time( reset, "GPU", spmvCuda ); + benchmark.time< Devices::Cuda >( reset, "GPU", spmvCuda ); #endif return true; diff --git a/src/Benchmarks/BLAS/vector-operations.h b/src/Benchmarks/BLAS/vector-operations.h index 8dd63de85b985f72a9a1dc3888f2e587234340be..e191b8fbb9a7949b7aba399591c5b2928909d893 100644 --- a/src/Benchmarks/BLAS/vector-operations.h +++ b/src/Benchmarks/BLAS/vector-operations.h @@ -90,9 +90,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.max(); }; benchmark.setOperation( "max", datasetSize ); - benchmark.time( reset1, "CPU", maxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", maxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", maxCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", maxCuda ); #endif @@ -103,9 +103,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.min(); }; benchmark.setOperation( "min", datasetSize ); - benchmark.time( reset1, "CPU", minHost ); + benchmark.time< Devices::Host >( reset1, "CPU", minHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", minCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", minCuda ); #endif @@ -125,10 +125,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMax", datasetSize ); - benchmark.time( reset1, "CPU", absMaxHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMaxHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMaxCuda ); - benchmark.time( reset1, "cuBLAS", absMaxCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMaxCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMaxCublas ); #endif @@ -148,10 +148,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "absMin", datasetSize ); - benchmark.time( reset1, "CPU", absMinHost ); + benchmark.time< Devices::Host >( reset1, "CPU", absMinHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", absMinCuda ); - benchmark.time( reset1, "cuBLAS", absMinCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", absMinCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", absMinCublas ); #endif @@ -162,9 +162,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.sum(); }; benchmark.setOperation( "sum", datasetSize ); - benchmark.time( reset1, "CPU", sumHost ); + benchmark.time< Devices::Host >( reset1, "CPU", sumHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", sumCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", sumCuda ); #endif @@ -182,10 +182,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l1 norm", datasetSize ); - benchmark.time( reset1, "CPU", l1normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l1normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l1normCuda ); - benchmark.time( reset1, "cuBLAS", l1normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l1normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l1normCublas ); #endif @@ -203,10 +203,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "l2 norm", datasetSize ); - benchmark.time( reset1, "CPU", l2normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l2normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l2normCuda ); - benchmark.time( reset1, "cuBLAS", l2normCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l2normCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", l2normCublas ); #endif @@ -217,9 +217,9 @@ benchmarkVectorOperations( Benchmark & benchmark, resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmark.setOperation( "l3 norm", datasetSize ); - benchmark.time( reset1, "CPU", l3normHost ); + benchmark.time< Devices::Host >( reset1, "CPU", l3normHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", l3normCuda ); + benchmark.time< Devices::Cuda >( reset1, "GPU", l3normCuda ); #endif @@ -238,10 +238,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar product", 2 * datasetSize ); - benchmark.time( reset1, "CPU", scalarProductHost ); + benchmark.time< Devices::Host >( reset1, "CPU", scalarProductHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", scalarProductCuda ); - benchmark.time( reset1, "cuBLAS", scalarProductCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", scalarProductCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", scalarProductCublas ); #endif /* @@ -289,10 +289,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "scalar multiplication", 2 * datasetSize ); - benchmark.time( reset1, "CPU", multiplyHost ); + benchmark.time< Devices::Host >( reset1, "CPU", multiplyHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", multiplyCuda ); - benchmark.time( reset1, "cuBLAS", multiplyCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", multiplyCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", multiplyCublas ); #endif @@ -312,10 +312,10 @@ benchmarkVectorOperations( Benchmark & benchmark, }; #endif benchmark.setOperation( "vector addition", 3 * datasetSize ); - benchmark.time( reset1, "CPU", addVectorHost ); + benchmark.time< Devices::Host >( reset1, "CPU", addVectorHost ); #ifdef HAVE_CUDA - benchmark.time( reset1, "GPU", addVectorCuda ); - benchmark.time( reset1, "cuBLAS", addVectorCublas ); + benchmark.time< Devices::Cuda >( reset1, "GPU", addVectorCuda ); + benchmark.time< Devices::Cuda >( reset1, "cuBLAS", addVectorCublas ); #endif diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index c371e2dfb3ec1bba387bddbdb454abbaaab44f6e..435e7037379be182844dfe39b672bc39a258af68 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -34,53 +34,60 @@ namespace Benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; -template< typename ComputeFunction, - typename ResetFunction, - typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > -double -timeFunction( ComputeFunction compute, - ResetFunction reset, - int loops, - const double& minTime, - Monitor && monitor = Monitor() ) +template< typename Device > +class FunctionTimer { - // the timer is constructed zero-initialized and stopped - Timer timer; - - // set timer to the monitor - monitor.setTimer( timer ); - - // warm up - reset(); - compute(); - - //timer.start(); - int i; - for( i = 0; - i < loops || timer.getRealTime() < minTime; - ++i) - { - // abuse the monitor's "time" for loops - monitor.setTime( i + 1 ); - - reset(); - - // Explicit synchronization of the CUDA device - // TODO: not necessary for host computations -#ifdef HAVE_CUDA - cudaDeviceSynchronize(); + public: + using DeviceType = Device; + + template< typename ComputeFunction, + typename ResetFunction, + typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > + static double + timeFunction( ComputeFunction compute, + ResetFunction reset, + int loops, + const double& minTime, + Monitor && monitor = Monitor() ) + { + // the timer is constructed zero-initialized and stopped + Timer timer; + + // set timer to the monitor + monitor.setTimer( timer ); + + // warm up + reset(); + compute(); + + //timer.start(); + int i; + for( i = 0; + i < loops || timer.getRealTime() < minTime; + ++i) + { + // abuse the monitor's "time" for loops + monitor.setTime( i + 1 ); + + reset(); + + // Explicit synchronization of the CUDA device +#ifdef HAVE_CUDA + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.start(); - compute(); + timer.start(); + compute(); #ifdef HAVE_CUDA - cudaDeviceSynchronize(); + if( std::is_same< Device, Devices::Cuda >::value ) + cudaDeviceSynchronize(); #endif - timer.stop(); - } - - return timer.getRealTime() / ( double ) i; -} + timer.stop(); + } + return timer.getRealTime() / ( double ) i; + } +}; class Logging { @@ -443,7 +450,8 @@ public: // "speedup" columns. // TODO: allow custom columns bound to lambda functions (e.g. for Gflops calculation) // Also terminates the recursion of the following variadic template. - template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction > double time( ResetFunction reset, @@ -456,10 +464,10 @@ public: if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); - result.time = timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); } else { - result.time = timeFunction( compute, reset, loops, minTime, monitor ); + result.time = FunctionTimer< Device >::timeFunction( compute, reset, loops, minTime, monitor ); } } catch ( const std::exception& e ) { @@ -477,7 +485,8 @@ public: return this->baseTime; } - template< typename ResetFunction, + template< typename Device, + typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline double @@ -486,7 +495,7 @@ public: ComputeFunction & compute ) { BenchmarkResult result; - return time( reset, performer, compute, result ); + return time< Device, ResetFunction, ComputeFunction >( reset, performer, compute, result ); } // Adds an error message to the log. Should be called in places where the diff --git a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h index a3bd76753aaab02367e5f509cead9dacee2f1b62..55c6bc1567aafc3196dfe9da1cc3e92803ec65b7 100644 --- a/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h +++ b/src/Benchmarks/DistSpMV/tnl-benchmark-distributed-spmv.h @@ -62,7 +62,7 @@ benchmarkSpmv( Benchmark& benchmark, matrix.vectorProduct( x, y ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > @@ -114,7 +114,7 @@ benchmarkDistributedSpmv( Benchmark& benchmark, Matrix::CommunicatorType::Barrier( matrix.getCommunicationGroup() ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< typename Matrix, typename Vector > diff --git a/src/Benchmarks/LinearSolvers/benchmarks.h b/src/Benchmarks/LinearSolvers/benchmarks.h index a82ec2dc297ba008d45b5291ece3df07d47fefe7..c6278a76b1e10de7c830a6bae27476623796e42b 100644 --- a/src/Benchmarks/LinearSolvers/benchmarks.h +++ b/src/Benchmarks/LinearSolvers/benchmarks.h @@ -73,7 +73,7 @@ benchmarkPreconditionerUpdate( Benchmark& benchmark, barrier( matrix ); }; - benchmark.time( reset, performer, compute ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute ); } template< template<typename> class Solver, template<typename> class Preconditioner, typename Matrix, typename Vector > @@ -166,7 +166,7 @@ benchmarkSolver( Benchmark& benchmark, }; MyBenchmarkResult benchmarkResult( solver, matrix, x, b ); - benchmark.time( reset, performer, compute, benchmarkResult ); + benchmark.time< typename Matrix::DeviceType >( reset, performer, compute, benchmarkResult ); } #ifdef HAVE_ARMADILLO diff --git a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h index f1c4efeed635ea1566d8b087b564168603ae1670..9e80b0d062f47424e293cfa81f1646384b5ad9eb 100644 --- a/src/Benchmarks/Traversers/tnl-benchmark-traversers.h +++ b/src/Benchmarks/Traversers/tnl-benchmark-traversers.h @@ -41,7 +41,7 @@ bool runBenchmark( const Config::ParameterContainer& parameters, const int maxSize = parameters.getParameter< int >( "max-size" ); // Full grid traversing - benchmark.newBenchmark( String("Full grid traversing - write 1" + convertToString( Dimension ) + "D" ), metadata ); + benchmark.newBenchmark( String("Full grid traversing - write 1 " + convertToString( Dimension ) + "D" ), metadata ); for( std::size_t size = minSize; size <= maxSize; size *= 2 ) { @@ -78,9 +78,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "Pure C", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingPureC ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingPureC ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingPureC ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingPureC ); #endif /**** @@ -97,9 +97,9 @@ bool runBenchmark( const Config::ParameterContainer& parameters, }; benchmark.setOperation( "parallel for", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingParallelFor ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingParallelFor ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingParallelFor ); #endif /**** @@ -113,12 +113,12 @@ bool runBenchmark( const Config::ParameterContainer& parameters, auto cudaWriteOneUsingTraverser = [&] () { cudaTraverserBenchmark.writeOneUsingTraverser(); - } + }; benchmark.setOperation( "traverser", pow( ( double ) size, ( double ) Dimension ) * sizeof( Real ) / oneGB ); - benchmark.time( hostReset, "CPU", hostWriteOneUsingTraverser ); + benchmark.time< Devices::Host >( hostReset, "CPU", hostWriteOneUsingTraverser ); #ifdef HAVE_CUDA - benchmark.time( cudaReset, "GPU", cudaWriteOneUsingTraverser ); + benchmark.time< Devices::Cuda >( cudaReset, "GPU", cudaWriteOneUsingTraverser ); #endif } return true;