Loading src/Benchmarks/Benchmarks.h +28 −14 Original line number Diff line number Diff line Loading @@ -202,33 +202,35 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits<double>::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) if( this->reset ) result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) if( this->reset ) result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } catch ( const std::exception& e ) { std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; Loading Loading @@ -269,24 +271,25 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits<double>::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl; } result.bandwidth = datasetSize / result.time; Loading Loading @@ -320,6 +323,7 @@ public: // each computation has 3 subcolumns const int colspan = 3 * numberOfComputations; writeErrorMessage( msg, colspan ); std::cerr << msg << std::endl; } using Logging::save; Loading @@ -330,8 +334,18 @@ public: return monitor; } int getPerformedLoops() const { return this->performedLoops; } bool isResetingOn() const { return reset; } protected: int loops = 1; int loops = 1, performedLoops = 0; double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; Loading src/Benchmarks/FunctionTimer.h +17 −10 Original line number Diff line number Diff line Loading @@ -22,17 +22,17 @@ namespace TNL { namespace Benchmarks { template< typename Device, bool timing > template< typename Device > class FunctionTimer { public: using DeviceType = Device; template< typename ComputeFunction, template< bool timing, typename ComputeFunction, typename ResetFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > static double double timeFunction( ComputeFunction compute, ResetFunction reset, int maxLoops, Loading @@ -52,7 +52,6 @@ class FunctionTimer reset(); compute(); int loops; // If we do not perform reset function and don't need // the monitor, the timer is not interrupted after each loop. if( ! performReset && verbose < 2 ) Loading Loading @@ -85,7 +84,6 @@ class FunctionTimer { // abuse the monitor's "time" for loops monitor.setTime( loops + 1 ); reset(); // Explicit synchronization of the CUDA device Loading @@ -104,15 +102,17 @@ class FunctionTimer timer.stop(); } } std::cerr << loops << std::endl; if( timing ) return timer.getRealTime() / ( double ) loops; else return std::numeric_limits<double>::quiet_NaN(); } template< typename ComputeFunction, template< bool timing, typename ComputeFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > static double double timeFunction( ComputeFunction compute, int maxLoops, const double& minTime, Loading @@ -120,8 +120,15 @@ class FunctionTimer Monitor && monitor = Monitor() ) { auto noReset = [] () {}; return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } int getPerformedLoops() const { return this->loops; } protected: int loops; }; } // namespace Benchmarks Loading src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +9 −1 Original line number Diff line number Diff line Loading @@ -54,12 +54,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index > userData( this->u ) { v_data = v.getData(); u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); u->getData().setValue( 0.0 ); }; void addOneUsingPureC() Loading Loading @@ -146,6 +146,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index > size );*/ } bool checkAddOne( int loops, bool reseting ) { std::cout << loops << " -> " << v << std::endl; if( reseting ) return v.containsOnlyValue( 1.0 ); return v.containsOnlyValue( ( Real ) loops ); } void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) Loading src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +9 −2 Original line number Diff line number Diff line Loading @@ -52,12 +52,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > userData( u ) { v_data = v.getData(); u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); u->getData().setValue( 0.0 ); }; void addOneUsingPureC() Loading @@ -71,7 +71,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA dim3 blockSize( 256 ), blocksCount, gridsCount; dim3 blockSize( 16, 16 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, Loading Loading @@ -183,6 +183,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index > }*/ } bool checkAddOne( int loops, bool reseting ) { if( reseting ) return v.containsOnlyValue( 1.0 ); return v.containsOnlyValue( ( Real ) loops ); } void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) Loading src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +9 −3 Original line number Diff line number Diff line Loading @@ -58,12 +58,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > userData( u ) { v_data = v.getData(); u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); u->getData().setValue( 0.0 ); }; void addOneUsingPureC() Loading @@ -78,7 +78,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA dim3 blockSize( 256 ), blocksCount, gridsCount; dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, Loading Loading @@ -174,13 +174,19 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } void addOneUsingTraverser() { traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } bool checkAddOne( int loops, bool reseting ) { if( reseting ) return v.containsOnlyValue( 1.0 ); return v.containsOnlyValue( ( Real ) loops ); } void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) Loading Loading
src/Benchmarks/Benchmarks.h +28 −14 Original line number Diff line number Diff line Loading @@ -202,33 +202,35 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits<double>::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) if( this->reset ) result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) if( this->reset ) result.time = FunctionTimer< Device, true >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else if( this->reset ) result.time = FunctionTimer< Device, false >::timeFunction( compute, reset, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, reset, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } this->performedLoops = functionTimer.getPerformedLoops(); } catch ( const std::exception& e ) { std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; Loading Loading @@ -269,24 +271,25 @@ public: BenchmarkResult & result ) { result.time = std::numeric_limits<double>::quiet_NaN(); FunctionTimer< Device > functionTimer; try { if( verbose > 1 ) { // run the monitor main loop Solvers::SolverMonitorThread monitor_thread( monitor ); if( this->timing ) result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } else { if( this->timing ) result.time = FunctionTimer< Device, true >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< true >( compute, loops, minTime, verbose, monitor ); else result.time = FunctionTimer< Device, false >::timeFunction( compute, loops, minTime, verbose, monitor ); result.time = functionTimer. template timeFunction< false >( compute, loops, minTime, verbose, monitor ); } } catch ( const std::exception& e ) { std::cerr << "timeFunction failed due to a C++ exception with description: " << e.what() << std::endl; std::cerr << "Function timer failed due to a C++ exception with description: " << e.what() << std::endl; } result.bandwidth = datasetSize / result.time; Loading Loading @@ -320,6 +323,7 @@ public: // each computation has 3 subcolumns const int colspan = 3 * numberOfComputations; writeErrorMessage( msg, colspan ); std::cerr << msg << std::endl; } using Logging::save; Loading @@ -330,8 +334,18 @@ public: return monitor; } int getPerformedLoops() const { return this->performedLoops; } bool isResetingOn() const { return reset; } protected: int loops = 1; int loops = 1, performedLoops = 0; double minTime = 0.0; double datasetSize = 0.0; double baseTime = 0.0; Loading
src/Benchmarks/FunctionTimer.h +17 −10 Original line number Diff line number Diff line Loading @@ -22,17 +22,17 @@ namespace TNL { namespace Benchmarks { template< typename Device, bool timing > template< typename Device > class FunctionTimer { public: using DeviceType = Device; template< typename ComputeFunction, template< bool timing, typename ComputeFunction, typename ResetFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > static double double timeFunction( ComputeFunction compute, ResetFunction reset, int maxLoops, Loading @@ -52,7 +52,6 @@ class FunctionTimer reset(); compute(); int loops; // If we do not perform reset function and don't need // the monitor, the timer is not interrupted after each loop. if( ! performReset && verbose < 2 ) Loading Loading @@ -85,7 +84,6 @@ class FunctionTimer { // abuse the monitor's "time" for loops monitor.setTime( loops + 1 ); reset(); // Explicit synchronization of the CUDA device Loading @@ -104,15 +102,17 @@ class FunctionTimer timer.stop(); } } std::cerr << loops << std::endl; if( timing ) return timer.getRealTime() / ( double ) loops; else return std::numeric_limits<double>::quiet_NaN(); } template< typename ComputeFunction, template< bool timing, typename ComputeFunction, typename Monitor = TNL::Solvers::IterativeSolverMonitor< double, int > > static double double timeFunction( ComputeFunction compute, int maxLoops, const double& minTime, Loading @@ -120,8 +120,15 @@ class FunctionTimer Monitor && monitor = Monitor() ) { auto noReset = [] () {}; return timeFunction( compute, noReset, maxLoops, minTime, verbose, monitor, false ); return timeFunction< timing >( compute, noReset, maxLoops, minTime, verbose, monitor, false ); } int getPerformedLoops() const { return this->loops; } protected: int loops; }; } // namespace Benchmarks Loading
src/Benchmarks/Traversers/GridTraversersBenchmark_1D.h +9 −1 Original line number Diff line number Diff line Loading @@ -54,12 +54,12 @@ class GridTraversersBenchmark< 1, Device, Real, Index > userData( this->u ) { v_data = v.getData(); u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); u->getData().setValue( 0.0 ); }; void addOneUsingPureC() Loading Loading @@ -146,6 +146,14 @@ class GridTraversersBenchmark< 1, Device, Real, Index > size );*/ } bool checkAddOne( int loops, bool reseting ) { std::cout << loops << " -> " << v << std::endl; if( reseting ) return v.containsOnlyValue( 1.0 ); return v.containsOnlyValue( ( Real ) loops ); } void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) Loading
src/Benchmarks/Traversers/GridTraversersBenchmark_2D.h +9 −2 Original line number Diff line number Diff line Loading @@ -52,12 +52,12 @@ class GridTraversersBenchmark< 2, Device, Real, Index > userData( u ) { v_data = v.getData(); u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); u->getData().setValue( 0.0 ); }; void addOneUsingPureC() Loading @@ -71,7 +71,7 @@ class GridTraversersBenchmark< 2, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA dim3 blockSize( 256 ), blocksCount, gridsCount; dim3 blockSize( 16, 16 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, Loading Loading @@ -183,6 +183,13 @@ class GridTraversersBenchmark< 2, Device, Real, Index > }*/ } bool checkAddOne( int loops, bool reseting ) { if( reseting ) return v.containsOnlyValue( 1.0 ); return v.containsOnlyValue( ( Real ) loops ); } void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) Loading
src/Benchmarks/Traversers/GridTraversersBenchmark_3D.h +9 −3 Original line number Diff line number Diff line Loading @@ -58,12 +58,12 @@ class GridTraversersBenchmark< 3, Device, Real, Index > userData( u ) { v_data = v.getData(); u->getData().bind( v ); } void reset() { v.setValue( 0.0 ); u->getData().setValue( 0.0 ); }; void addOneUsingPureC() Loading @@ -78,7 +78,7 @@ class GridTraversersBenchmark< 3, Device, Real, Index > else // Device == Devices::Cuda { #ifdef HAVE_CUDA dim3 blockSize( 256 ), blocksCount, gridsCount; dim3 blockSize( 32, 4, 2 ), blocksCount, gridsCount; Devices::Cuda::setupThreads( blockSize, blocksCount, Loading Loading @@ -174,13 +174,19 @@ class GridTraversersBenchmark< 3, Device, Real, Index > f, v.getData() ); } void addOneUsingTraverser() { traverser.template processAllEntities< UserDataType, AddOneEntitiesProcessorType > ( grid, userData ); } bool checkAddOne( int loops, bool reseting ) { if( reseting ) return v.containsOnlyValue( 1.0 ); return v.containsOnlyValue( ( Real ) loops ); } void traverseUsingPureC() { if( std::is_same< Device, Devices::Host >::value ) Loading