diff --git a/tests/benchmarks/benchmarks.h b/tests/benchmarks/benchmarks.h
index b61b027400264af2897662e5a040f5bd300291aa..5a8e488e8dd0588c3f4f7038e083a28a1cfb5322 100644
--- a/tests/benchmarks/benchmarks.h
+++ b/tests/benchmarks/benchmarks.h
@@ -9,6 +9,8 @@ namespace tnl
 namespace benchmarks
 {
 
+const double oneGB = 1024.0 * 1024.0 * 1024.0;
+
 // TODO: add data member for error message
 struct BenchmarkError
 {};
diff --git a/tests/benchmarks/tnl-cuda-benchmarks.h b/tests/benchmarks/tnl-cuda-benchmarks.h
index 9699ac029c7e306c1b6d46b010658c8c5467dab1..9809911cedcb50042d19f7c29a2ab8d577048566 100644
--- a/tests/benchmarks/tnl-cuda-benchmarks.h
+++ b/tests/benchmarks/tnl-cuda-benchmarks.h
@@ -19,13 +19,12 @@
 #define TNLCUDBENCHMARKS_H_
 
 #include <tnlConfig.h>
-#include <core/vectors/tnlVector.h>
 #include <core/tnlList.h>
 #include <matrices/tnlSlicedEllpackMatrix.h>
 #include <matrices/tnlEllpackMatrix.h>
 #include <matrices/tnlCSRMatrix.h>
 
-#include "benchmarks.h"
+#include "vector-operations.h"
 
 using namespace tnl::benchmarks;
 
@@ -37,7 +36,6 @@ using namespace tnl::benchmarks;
 template< typename Real, typename Device, typename Index >
 using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
 
-const double oneGB = 1024.0 * 1024.0 * 1024.0;
 
 
 // TODO:
@@ -199,9 +197,6 @@ int main( int argc, char* argv[] )
 #ifdef HAVE_CUDA
 
    typedef double Real;
-   typedef tnlVector< Real, tnlHost > HostVector;
-   typedef tnlVector< Real, tnlCuda > CudaVector;
-
 
    /****
     * The first argument of this program is the size od data set to be reduced.
@@ -217,139 +212,8 @@ int main( int argc, char* argv[] )
    if( argc > 3 )
       elementsPerRow = atoi( argv[ 3 ] );
 
-
-   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
-
-   HostVector hostVector, hostVector2;
-   CudaVector deviceVector, deviceVector2;
-   hostVector.setSize( size );
-   if( ! deviceVector.setSize( size ) )
-      return EXIT_FAILURE;
-   hostVector2.setLike( hostVector );
-   if( ! deviceVector2.setLike( deviceVector ) )
-      return EXIT_FAILURE;
-
-   Real resultHost, resultDevice;
-
-
-   // check functions
-   auto compare1 = [&]() {
-      return hostVector == deviceVector;
-   };
-   auto compare2 = [&]() {
-      return hostVector2 == deviceVector2;
-   };
-   auto compare12 = [&]() {
-      return compare1() && compare2();
-   };
-   auto compareScalars = [&]() {
-      return resultHost == resultDevice;
-   };
-
-   // reset functions
-   auto reset1 = [&]() {
-      hostVector.setValue( 1.0 );
-      deviceVector.setValue( 1.0 );
-   };
-   auto reset2 = [&]() {
-      hostVector2.setValue( 1.0 );
-      deviceVector2.setValue( 1.0 );
-   };
-   auto reset12 = [&]() {
-      reset1();
-      reset2();
-   };
-
-
-   reset12();
-
-   cout << "Benchmarking CPU-GPU memory transfer:" << endl;
-   auto copyAssign = [&]() {
-      deviceVector = hostVector;
-   };
-   cout << " ";
-   benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );
-
-
-   cout << "Benchmarking vector addition:" << endl;
-   auto addVectorHost = [&]() {
-      hostVector.addVector( hostVector2 );
-   };
-   auto addVectorCuda = [&]() {
-      deviceVector.addVector( deviceVector2 );
-      // TODO: synchronization should be part of addVector
-      cudaThreadSynchronize();
-   };
-   benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
-
-
-   cout << "Benchmarking scalar product:" << endl;
-   auto scalarProductHost = [&]() {
-      resultHost = hostVector.scalarProduct( hostVector2 );
-   };
-   auto scalarProductCuda = [&]() {
-      resultDevice = deviceVector.scalarProduct( deviceVector2 );
-   };
-   benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
-
-/* TODO
-#ifdef HAVE_CUBLAS
-   cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
-   cublasHandle_t handle;
-   cublasCreate( &handle );
-   timer.reset();
-   timer.start();
-   for( int i = 0; i < loops; i++ )
-      cublasDdot( handle,
-                  size,
-                  deviceVector.getData(), 1,
-                  deviceVector.getData(), 1,
-                  &resultDevice );
-   cudaThreadSynchronize();
-   timer.stop();
-   bandwidth = 2 * datasetSize / timer.getTime();
-   cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
-#endif
-*/
-
-   cout << "Benchmarking L2 norm: " << endl;
-   auto l2normHost = [&]() {
-      resultHost = hostVector.lpNorm( 2.0 );
-   };
-   auto l2normCuda = [&]() {
-      resultDevice = deviceVector.lpNorm( 2.0 );
-   };
-   benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
-
-
-   /*
-   cout << "Benchmarking prefix-sum:" << endl;
-   timer.reset();
-   timer.start();
-   hostVector.computePrefixSum();
-   timer.stop();
-   timeHost = timer.getTime();
-   bandwidth = 2 * datasetSize / loops / timer.getTime();
-   cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
-
-   timer.reset();
-   timer.start();
-   deviceVector.computePrefixSum();
-   timer.stop();
-   timeDevice = timer.getTime();
-   bandwidth = 2 * datasetSize / loops / timer.getTime();
-   cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
-   cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
-
-   HostVector auxHostVector;
-   auxHostVector.setLike( deviceVector );
-   auxHostVector = deviceVector;
-   for( int i = 0; i < size; i++ )
-      if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
-      {
-         cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl;
-      }
-*/
+   if( ! benchmarkVectorOperations< Real >( loops, size ) )
+      return EXIT_FAILURE;
    benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow );
    benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow );
 
diff --git a/tests/benchmarks/vector-operations.h b/tests/benchmarks/vector-operations.h
new file mode 100644
index 0000000000000000000000000000000000000000..b15bfeb843b66aeb2face40f9bad7a4d43156739
--- /dev/null
+++ b/tests/benchmarks/vector-operations.h
@@ -0,0 +1,170 @@
+#pragma once
+
+#include "benchmarks.h"
+
+#include <core/vectors/tnlVector.h>
+
+namespace tnl
+{
+namespace benchmarks
+{
+
+/****
+ * Benchmarks the CPU-GPU copy, vector addition, scalar product and L2 norm
+ * on host and CUDA vectors with \e size elements.
+ *
+ * \param loops number of repetitions passed to the benchmark helpers
+ * \param size  number of elements of each benchmarked vector
+ * \return false if setting the size of any vector fails, true otherwise
+ */
+template< typename Real = double,
+          typename Index = int >
+bool
+benchmarkVectorOperations( const int & loops,
+                           const int & size )
+{
+    typedef tnlVector< Real, tnlHost, Index > HostVector;
+    typedef tnlVector< Real, tnlCuda, Index > CudaVector;
+    using namespace std;
+
+    // convert to double before multiplying, loops * size may overflow int
+    double datasetSize = ( double ) loops * size * sizeof( Real ) / oneGB;
+
+    HostVector hostVector, hostVector2;
+    CudaVector deviceVector, deviceVector2;
+    if( ! hostVector.setSize( size ) )
+        return false;
+    if( ! deviceVector.setSize( size ) )
+        return false;
+    if( ! hostVector2.setLike( hostVector ) )
+        return false;
+    if( ! deviceVector2.setLike( deviceVector ) )
+        return false;
+
+    Real resultHost, resultDevice;
+
+
+    // check functions
+    auto compare1 = [&]() {
+        return hostVector == deviceVector;
+    };
+    auto compare2 = [&]() {
+        return hostVector2 == deviceVector2;
+    };
+    auto compare12 = [&]() {
+        return compare1() && compare2();
+    };
+    auto compareScalars = [&]() {
+        return resultHost == resultDevice;
+    };
+
+    // reset functions
+    auto reset1 = [&]() {
+        hostVector.setValue( 1.0 );
+        deviceVector.setValue( 1.0 );
+    };
+    auto reset2 = [&]() {
+        hostVector2.setValue( 1.0 );
+        deviceVector2.setValue( 1.0 );
+    };
+    auto reset12 = [&]() {
+        reset1();
+        reset2();
+    };
+
+
+    reset12();
+
+    cout << "Benchmarking CPU-GPU memory transfer:" << endl;
+    auto copyAssign = [&]() {
+        deviceVector = hostVector;
+    };
+    cout << " ";
+    benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );
+
+
+    cout << "Benchmarking vector addition:" << endl;
+    auto addVectorHost = [&]() {
+        hostVector.addVector( hostVector2 );
+    };
+    auto addVectorCuda = [&]() {
+        deviceVector.addVector( deviceVector2 );
+        // TODO: synchronization should be part of addVector
+        cudaDeviceSynchronize();
+    };
+    benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
+
+
+    cout << "Benchmarking scalar product:" << endl;
+    auto scalarProductHost = [&]() {
+        resultHost = hostVector.scalarProduct( hostVector2 );
+    };
+    auto scalarProductCuda = [&]() {
+        resultDevice = deviceVector.scalarProduct( deviceVector2 );
+    };
+    benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
+
+/* TODO
+#ifdef HAVE_CUBLAS
+    cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
+    cublasHandle_t handle;
+    cublasCreate( &handle );
+    timer.reset();
+    timer.start();
+    for( int i = 0; i < loops; i++ )
+        cublasDdot( handle,
+                    size,
+                    deviceVector.getData(), 1,
+                    deviceVector.getData(), 1,
+                    &resultDevice );
+    cudaThreadSynchronize();
+    timer.stop();
+    bandwidth = 2 * datasetSize / timer.getTime();
+    cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
+#endif
+*/
+
+    cout << "Benchmarking L2 norm: " << endl;
+    auto l2normHost = [&]() {
+        resultHost = hostVector.lpNorm( 2.0 );
+    };
+    auto l2normCuda = [&]() {
+        resultDevice = deviceVector.lpNorm( 2.0 );
+    };
+    benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
+
+
+    /*
+    cout << "Benchmarking prefix-sum:" << endl;
+    timer.reset();
+    timer.start();
+    hostVector.computePrefixSum();
+    timer.stop();
+    timeHost = timer.getTime();
+    bandwidth = 2 * datasetSize / loops / timer.getTime();
+    cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
+
+    timer.reset();
+    timer.start();
+    deviceVector.computePrefixSum();
+    timer.stop();
+    timeDevice = timer.getTime();
+    bandwidth = 2 * datasetSize / loops / timer.getTime();
+    cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
+    cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
+
+    HostVector auxHostVector;
+    auxHostVector.setLike( deviceVector );
+    auxHostVector = deviceVector;
+    for( int i = 0; i < size; i++ )
+        if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
+        {
+            cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl;
+        }
+    */
+
+    return true;
+}
+
+} // namespace benchmarks
+} // namespace tnl