Loading tests/benchmarks/benchmarks.h +2 −0 Original line number Diff line number Diff line Loading @@ -9,6 +9,8 @@ namespace tnl namespace benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; // TODO: add data member for error message struct BenchmarkError {}; Loading tests/benchmarks/tnl-cuda-benchmarks.h +2 −139 Original line number Diff line number Diff line Loading @@ -19,13 +19,12 @@ #define TNLCUDBENCHMARKS_H_ #include <tnlConfig.h> #include <core/vectors/tnlVector.h> #include <core/tnlList.h> #include <matrices/tnlSlicedEllpackMatrix.h> #include <matrices/tnlEllpackMatrix.h> #include <matrices/tnlCSRMatrix.h> #include "benchmarks.h" #include "vector-operations.h" using namespace tnl::benchmarks; Loading @@ -37,7 +36,6 @@ using namespace tnl::benchmarks; template< typename Real, typename Device, typename Index > using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; const double oneGB = 1024.0 * 1024.0 * 1024.0; // TODO: Loading Loading @@ -199,9 +197,6 @@ int main( int argc, char* argv[] ) #ifdef HAVE_CUDA typedef double Real; typedef tnlVector< Real, tnlHost > HostVector; typedef tnlVector< Real, tnlCuda > CudaVector; /**** * The first argument of this program is the size od data set to be reduced. Loading @@ -217,139 +212,7 @@ int main( int argc, char* argv[] ) if( argc > 3 ) elementsPerRow = atoi( argv[ 3 ] ); double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2; hostVector.setSize( size ); if( ! deviceVector.setSize( size ) ) return EXIT_FAILURE; hostVector2.setLike( hostVector ); if( ! deviceVector2.setLike( deviceVector ) ) return EXIT_FAILURE; Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); }; auto reset2 = [&]() { hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; reset12(); cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssign = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); cout << "Benchmarking scalar product:" << endl; auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar product on GPU with Cublas: " << endl; cublasHandle_t handle; cublasCreate( &handle ); timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) cublasDdot( handle, size, deviceVector.getData(), 1, deviceVector.getData(), 1, &resultDevice ); cudaThreadSynchronize(); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; #endif */ cout << "Benchmarking L2 norm: " << endl; auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); /* cout << "Benchmarking prefix-sum:" << endl; timer.reset(); timer.start(); hostVector.computePrefixSum(); timer.stop(); timeHost = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; timer.reset(); timer.start(); deviceVector.computePrefixSum(); timer.stop(); timeDevice = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; HostVector auxHostVector; auxHostVector.setLike( deviceVector ); auxHostVector = deviceVector; for( int i = 0; i < size; i++ ) if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) { cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl; } */ benchmarkVectorOperations< Real >( loops, size ); benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow ); Loading tests/benchmarks/vector-operations.h 0 → 100644 +159 −0 Original line number Diff line number Diff line #pragma once #include "benchmarks.h" #include <core/vectors/tnlVector.h> namespace tnl { namespace benchmarks { template< typename Real = double, typename Index = int > bool benchmarkVectorOperations( const int & loops, const int & size ) { typedef tnlVector< Real, tnlHost, Index > HostVector; typedef tnlVector< Real, tnlCuda, Index > CudaVector; using namespace std; double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2; hostVector.setSize( size ); if( ! deviceVector.setSize( size ) ) return false; hostVector2.setLike( hostVector ); if( ! deviceVector2.setLike( deviceVector ) ) return false; Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); }; auto reset2 = [&]() { hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; reset12(); cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssign = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); cout << "Benchmarking scalar product:" << endl; auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar product on GPU with Cublas: " << endl; cublasHandle_t handle; cublasCreate( &handle ); timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) cublasDdot( handle, size, deviceVector.getData(), 1, deviceVector.getData(), 1, &resultDevice ); cudaThreadSynchronize(); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; #endif */ cout << "Benchmarking L2 norm: " << endl; auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); /* cout << "Benchmarking prefix-sum:" << endl; timer.reset(); timer.start(); hostVector.computePrefixSum(); timer.stop(); timeHost = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; timer.reset(); timer.start(); deviceVector.computePrefixSum(); timer.stop(); timeDevice = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; HostVector auxHostVector; auxHostVector.setLike( deviceVector ); auxHostVector = deviceVector; for( int i = 0; i < size; i++ ) if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) { cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl; } */ return true; } } // namespace benchmarks } // namespace tnl Loading
tests/benchmarks/benchmarks.h +2 −0 Original line number Diff line number Diff line Loading @@ -9,6 +9,8 @@ namespace tnl namespace benchmarks { const double oneGB = 1024.0 * 1024.0 * 1024.0; // TODO: add data member for error message struct BenchmarkError {}; Loading
tests/benchmarks/tnl-cuda-benchmarks.h +2 −139 Original line number Diff line number Diff line Loading @@ -19,13 +19,12 @@ #define TNLCUDBENCHMARKS_H_ #include <tnlConfig.h> #include <core/vectors/tnlVector.h> #include <core/tnlList.h> #include <matrices/tnlSlicedEllpackMatrix.h> #include <matrices/tnlEllpackMatrix.h> #include <matrices/tnlCSRMatrix.h> #include "benchmarks.h" #include "vector-operations.h" using namespace tnl::benchmarks; Loading @@ -37,7 +36,6 @@ using namespace tnl::benchmarks; template< typename Real, typename Device, typename Index > using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; const double oneGB = 1024.0 * 1024.0 * 1024.0; // TODO: Loading Loading @@ -199,9 +197,6 @@ int main( int argc, char* argv[] ) #ifdef HAVE_CUDA typedef double Real; typedef tnlVector< Real, tnlHost > HostVector; typedef tnlVector< Real, tnlCuda > CudaVector; /**** * The first argument of this program is the size od data set to be reduced. Loading @@ -217,139 +212,7 @@ int main( int argc, char* argv[] ) if( argc > 3 ) elementsPerRow = atoi( argv[ 3 ] ); double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2; hostVector.setSize( size ); if( ! deviceVector.setSize( size ) ) return EXIT_FAILURE; hostVector2.setLike( hostVector ); if( ! deviceVector2.setLike( deviceVector ) ) return EXIT_FAILURE; Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); }; auto reset2 = [&]() { hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; reset12(); cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssign = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); cout << "Benchmarking scalar product:" << endl; auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar product on GPU with Cublas: " << endl; cublasHandle_t handle; cublasCreate( &handle ); timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) cublasDdot( handle, size, deviceVector.getData(), 1, deviceVector.getData(), 1, &resultDevice ); cudaThreadSynchronize(); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; #endif */ cout << "Benchmarking L2 norm: " << endl; auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); /* cout << "Benchmarking prefix-sum:" << endl; timer.reset(); timer.start(); hostVector.computePrefixSum(); timer.stop(); timeHost = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; timer.reset(); timer.start(); deviceVector.computePrefixSum(); timer.stop(); timeDevice = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; HostVector auxHostVector; auxHostVector.setLike( deviceVector ); auxHostVector = deviceVector; for( int i = 0; i < size; i++ ) if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) { cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl; } */ benchmarkVectorOperations< Real >( loops, size ); benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow ); benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow ); Loading
tests/benchmarks/vector-operations.h 0 → 100644 +159 −0 Original line number Diff line number Diff line #pragma once #include "benchmarks.h" #include <core/vectors/tnlVector.h> namespace tnl { namespace benchmarks { template< typename Real = double, typename Index = int > bool benchmarkVectorOperations( const int & loops, const int & size ) { typedef tnlVector< Real, tnlHost, Index > HostVector; typedef tnlVector< Real, tnlCuda, Index > CudaVector; using namespace std; double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostVector hostVector, hostVector2; CudaVector deviceVector, deviceVector2; hostVector.setSize( size ); if( ! deviceVector.setSize( size ) ) return false; hostVector2.setLike( hostVector ); if( ! deviceVector2.setLike( deviceVector ) ) return false; Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); }; auto reset2 = [&]() { hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; reset12(); cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssign = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); cout << "Benchmarking scalar product:" << endl; auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar product on GPU with Cublas: " << endl; cublasHandle_t handle; cublasCreate( &handle ); timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) cublasDdot( handle, size, deviceVector.getData(), 1, deviceVector.getData(), 1, &resultDevice ); cudaThreadSynchronize(); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; #endif */ cout << "Benchmarking L2 norm: " << endl; auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); /* cout << "Benchmarking prefix-sum:" << endl; timer.reset(); timer.start(); hostVector.computePrefixSum(); timer.stop(); timeHost = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; timer.reset(); timer.start(); deviceVector.computePrefixSum(); timer.stop(); timeDevice = timer.getTime(); bandwidth = 2 * datasetSize / loops / timer.getTime(); cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; HostVector auxHostVector; auxHostVector.setLike( deviceVector ); auxHostVector = deviceVector; for( int i = 0; i < size; i++ ) if( hostVector.getElement( i ) != auxHostVector.getElement( i ) ) { cerr << "Error in prefix sum at position " << i << ": " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl; } */ return true; } } // namespace benchmarks } // namespace tnl