Loading tests/benchmarks/tnl-cuda-benchmarks.h +141 −102 Original line number Diff line number Diff line Loading @@ -41,6 +41,8 @@ const double oneGB = 1024.0 * 1024.0 * 1024.0; // check operations with the timer: // - reset() clears the timer and starts it again // - getTime() stops the timer and starts it again !!! // - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer // FIXME: scalarProduct is not const method template< typename Matrix > Loading Loading @@ -107,19 +109,86 @@ void setCudaTestMatrix( Matrix& matrix, tnlCuda::freeFromDevice( kernel_matrix ); } template<typename Function, typename... Args> double time_void_function(int loops, Function & f, Args & ...args) // TODO: add data member for error message struct BenchmarkError {}; auto trueFunc = []() { return true; }; auto voidFunc = [](){}; template< typename ComputeFunction, typename CheckFunction, typename ResetFunction > double benchmarkSingle( const int & loops, const double & datasetSize, // in GB ComputeFunction & compute, // TODO: check that default argument works here CheckFunction & check = trueFunc, ResetFunction & reset = voidFunc ) { tnlTimerRT timer; timer.reset(); for(int i = 0; i < loops; ++i) { timer.start(); f(args...); compute(); timer.stop(); if( ! check() ) throw BenchmarkError(); reset(); } return timer.getTime(); const double time = timer.getTime(); const double bandwidth = datasetSize / time; cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << endl; return time; } template< typename ComputeHostFunction, typename ComputeCudaFunction, typename CheckFunction, typename ResetFunction > void benchmarkCuda( const int & loops, const double & datasetSize, // in GB ComputeHostFunction & computeHost, ComputeCudaFunction & computeCuda, // TODO: check that default argument works here CheckFunction & check = trueFunc, ResetFunction & reset = voidFunc ) { tnlTimerRT timerHost, timerCuda; timerHost.reset(); timerHost.stop(); timerCuda.reset(); timerCuda.stop(); for(int i = 0; i < loops; ++i) { timerHost.start(); computeHost(); timerHost.stop(); timerCuda.start(); computeCuda(); timerCuda.stop(); if( ! check() ) throw BenchmarkError(); reset(); } const double timeHost = timerHost.getTime(); const double timeCuda = timerCuda.getTime(); const double bandwidthHost = datasetSize / timeHost; const double bandwidthCuda = datasetSize / timeCuda; cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << endl; cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeCuda << endl; } template< typename Real, Loading Loading @@ -169,43 +238,33 @@ benchmarkSpMV( const int & loops, return false; } double bandwidth( 0.0 ), datasetSize( 0.0 ), timeHost( 0.0 ), timeDevice( 0.0 ); tnlList< tnlString > parsedType; parseObjectType( HostMatrix::getType(), parsedType ); cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl; const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); auto spmvHost = []( const HostMatrix & m, const HostVector & x, HostVector & y ) { m.vectorProduct( x, y ); // check and reset functions auto check = [&]() { return hostVector2 == deviceVector2; }; timeHost = time_void_function( loops, spmvHost, hostMatrix, hostVector, hostVector2 ); bandwidth = datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; auto spmvCuda = []( const DeviceMatrix & m, const CudaVector & x, CudaVector & y ) { m.vectorProduct( x, y ); auto reset = [&]() { hostVector2.setValue( 0.0 ); deviceVector2.setValue( 0.0 ); }; timeDevice = time_void_function( loops, spmvCuda, deviceMatrix, deviceVector, deviceVector2 ); bandwidth = datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; //cout << hostVector2 << endl << deviceVector2 << endl; if( hostVector2 != deviceVector2 ) { cerr << "Error in Spmv kernel" << endl; //for( int i = 0; i < size; i++ ) // if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) ) // cerr << " " << i; } // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset ); return true; } Loading Loading @@ -234,7 +293,6 @@ int main( int argc, char* argv[] ) elementsPerRow = atoi( argv[ 3 ] ); double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostVector hostVector, hostVector2; Loading @@ -246,76 +304,70 @@ int main( int argc, char* argv[] ) if( ! deviceVector2.setLike( deviceVector ) ) return EXIT_FAILURE; Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); }; auto reset2 = [&]() { hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; double bandwidth( 0.0 ); Real resultHost, resultDevice, timeHost, timeDevice; reset12(); cout << "Benchmarking CPU-GPU memory bandwidth: "; auto copyAssign = []( CudaVector & v1, const HostVector & v2 ) { v1 = v2; cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssign = [&]() { deviceVector = hostVector; }; timeHost = time_void_function( loops, copyAssign, deviceVector, hostVector ); bandwidth = datasetSize / timeHost; cout << bandwidth << " GB/sec." << endl; cout << " "; benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = []( HostVector & v1, const HostVector & v2 ) { v1.addVector( v2 ); auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; timeHost = time_void_function( loops, addVectorHost, hostVector, hostVector2 ); bandwidth = 3 * datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; auto addVectorCuda = []( CudaVector & v1, const CudaVector & v2 ) { v1.addVector( v2 ); auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; timeDevice = time_void_function( loops, addVectorCuda, deviceVector, deviceVector2 ); bandwidth = 3 * datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); cout << "Benchmarking scalar product:" << endl; // FIXME: scalarProduct is not const method // auto scalarProductHost = []( const HostVector & v1, const HostVector & v2 ) { auto scalarProductHost = []( HostVector & v1, const HostVector & v2 ) { return v1.scalarProduct( v2 ); auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; timeHost = time_void_function( loops, scalarProductHost, hostVector, hostVector2 ); bandwidth = 2 * datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; // FIXME: scalarProduct is not const method // auto scalarProductCuda = []( const CudaVector & v1, const CudaVector & v2 ) { auto scalarProductCuda = []( CudaVector & v1, const CudaVector & v2 ) { return v1.scalarProduct( v2 ); auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; timeDevice = time_void_function( loops, scalarProductCuda, deviceVector, deviceVector2 ); bandwidth = 2 * datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; // TODO: devise a way to check the result of the timed function // if( resultHost != resultDevice ) // { // cerr << "Error. " << resultHost << " != " << resultDevice << endl; //return EXIT_FAILURE; // } benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar product on GPU with Cublas: " << endl; cublasHandle_t handle; Loading @@ -333,29 +385,16 @@ int main( int argc, char* argv[] ) bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; #endif */ cout << "Benchmarking L2 norm: " << endl; auto l2normHost = []( const HostVector & v ) { return v.lpNorm( 2.0 ); auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; timeHost = time_void_function( loops, l2normHost, hostVector ); bandwidth = datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; auto l2normCuda = []( const CudaVector & v ) { return v.lpNorm( 2.0 ); auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; timeDevice = time_void_function( loops, l2normCuda, deviceVector ); bandwidth = datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; // TODO: devise a way to check the result of the timed function // if( resultHost != resultDevice ) // { // cerr << "Error. " << resultHost << " != " << resultDevice << endl; //return EXIT_FAILURE; // } benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); /* Loading Loading
tests/benchmarks/tnl-cuda-benchmarks.h +141 −102 Original line number Diff line number Diff line Loading @@ -41,6 +41,8 @@ const double oneGB = 1024.0 * 1024.0 * 1024.0; // check operations with the timer: // - reset() clears the timer and starts it again // - getTime() stops the timer and starts it again !!! // - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer // FIXME: scalarProduct is not const method template< typename Matrix > Loading Loading @@ -107,19 +109,86 @@ void setCudaTestMatrix( Matrix& matrix, tnlCuda::freeFromDevice( kernel_matrix ); } template<typename Function, typename... Args> double time_void_function(int loops, Function & f, Args & ...args) // TODO: add data member for error message struct BenchmarkError {}; auto trueFunc = []() { return true; }; auto voidFunc = [](){}; template< typename ComputeFunction, typename CheckFunction, typename ResetFunction > double benchmarkSingle( const int & loops, const double & datasetSize, // in GB ComputeFunction & compute, // TODO: check that default argument works here CheckFunction & check = trueFunc, ResetFunction & reset = voidFunc ) { tnlTimerRT timer; timer.reset(); for(int i = 0; i < loops; ++i) { timer.start(); f(args...); compute(); timer.stop(); if( ! check() ) throw BenchmarkError(); reset(); } return timer.getTime(); const double time = timer.getTime(); const double bandwidth = datasetSize / time; cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << endl; return time; } template< typename ComputeHostFunction, typename ComputeCudaFunction, typename CheckFunction, typename ResetFunction > void benchmarkCuda( const int & loops, const double & datasetSize, // in GB ComputeHostFunction & computeHost, ComputeCudaFunction & computeCuda, // TODO: check that default argument works here CheckFunction & check = trueFunc, ResetFunction & reset = voidFunc ) { tnlTimerRT timerHost, timerCuda; timerHost.reset(); timerHost.stop(); timerCuda.reset(); timerCuda.stop(); for(int i = 0; i < loops; ++i) { timerHost.start(); computeHost(); timerHost.stop(); timerCuda.start(); computeCuda(); timerCuda.stop(); if( ! check() ) throw BenchmarkError(); reset(); } const double timeHost = timerHost.getTime(); const double timeCuda = timerCuda.getTime(); const double bandwidthHost = datasetSize / timeHost; const double bandwidthCuda = datasetSize / timeCuda; cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << endl; cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeCuda << endl; } template< typename Real, Loading Loading @@ -169,43 +238,33 @@ benchmarkSpMV( const int & loops, return false; } double bandwidth( 0.0 ), datasetSize( 0.0 ), timeHost( 0.0 ), timeDevice( 0.0 ); tnlList< tnlString > parsedType; parseObjectType( HostMatrix::getType(), parsedType ); cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl; const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); auto spmvHost = []( const HostMatrix & m, const HostVector & x, HostVector & y ) { m.vectorProduct( x, y ); // check and reset functions auto check = [&]() { return hostVector2 == deviceVector2; }; timeHost = time_void_function( loops, spmvHost, hostMatrix, hostVector, hostVector2 ); bandwidth = datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; auto spmvCuda = []( const DeviceMatrix & m, const CudaVector & x, CudaVector & y ) { m.vectorProduct( x, y ); auto reset = [&]() { hostVector2.setValue( 0.0 ); deviceVector2.setValue( 0.0 ); }; timeDevice = time_void_function( loops, spmvCuda, deviceMatrix, deviceVector, deviceVector2 ); bandwidth = datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; //cout << hostVector2 << endl << deviceVector2 << endl; if( hostVector2 != deviceVector2 ) { cerr << "Error in Spmv kernel" << endl; //for( int i = 0; i < size; i++ ) // if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) ) // cerr << " " << i; } // compute functions auto spmvHost = [&]() { hostMatrix.vectorProduct( hostVector, hostVector2 ); }; auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset ); return true; } Loading Loading @@ -234,7 +293,6 @@ int main( int argc, char* argv[] ) elementsPerRow = atoi( argv[ 3 ] ); double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB; HostVector hostVector, hostVector2; Loading @@ -246,76 +304,70 @@ int main( int argc, char* argv[] ) if( ! deviceVector2.setLike( deviceVector ) ) return EXIT_FAILURE; Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); }; auto reset2 = [&]() { hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); }; auto reset12 = [&]() { reset1(); reset2(); }; double bandwidth( 0.0 ); Real resultHost, resultDevice, timeHost, timeDevice; reset12(); cout << "Benchmarking CPU-GPU memory bandwidth: "; auto copyAssign = []( CudaVector & v1, const HostVector & v2 ) { v1 = v2; cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssign = [&]() { deviceVector = hostVector; }; timeHost = time_void_function( loops, copyAssign, deviceVector, hostVector ); bandwidth = datasetSize / timeHost; cout << bandwidth << " GB/sec." << endl; cout << " "; benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = []( HostVector & v1, const HostVector & v2 ) { v1.addVector( v2 ); auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; timeHost = time_void_function( loops, addVectorHost, hostVector, hostVector2 ); bandwidth = 3 * datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; auto addVectorCuda = []( CudaVector & v1, const CudaVector & v2 ) { v1.addVector( v2 ); auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; timeDevice = time_void_function( loops, addVectorCuda, deviceVector, deviceVector2 ); bandwidth = 3 * datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); hostVector2.setValue( 1.0 ); deviceVector2.setValue( 1.0 ); benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); cout << "Benchmarking scalar product:" << endl; // FIXME: scalarProduct is not const method // auto scalarProductHost = []( const HostVector & v1, const HostVector & v2 ) { auto scalarProductHost = []( HostVector & v1, const HostVector & v2 ) { return v1.scalarProduct( v2 ); auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; timeHost = time_void_function( loops, scalarProductHost, hostVector, hostVector2 ); bandwidth = 2 * datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; // FIXME: scalarProduct is not const method // auto scalarProductCuda = []( const CudaVector & v1, const CudaVector & v2 ) { auto scalarProductCuda = []( CudaVector & v1, const CudaVector & v2 ) { return v1.scalarProduct( v2 ); auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; timeDevice = time_void_function( loops, scalarProductCuda, deviceVector, deviceVector2 ); bandwidth = 2 * datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; // TODO: devise a way to check the result of the timed function // if( resultHost != resultDevice ) // { // cerr << "Error. " << resultHost << " != " << resultDevice << endl; //return EXIT_FAILURE; // } benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar product on GPU with Cublas: " << endl; cublasHandle_t handle; Loading @@ -333,29 +385,16 @@ int main( int argc, char* argv[] ) bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; #endif */ cout << "Benchmarking L2 norm: " << endl; auto l2normHost = []( const HostVector & v ) { return v.lpNorm( 2.0 ); auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; timeHost = time_void_function( loops, l2normHost, hostVector ); bandwidth = datasetSize / timeHost; cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl; auto l2normCuda = []( const CudaVector & v ) { return v.lpNorm( 2.0 ); auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; timeDevice = time_void_function( loops, l2normCuda, deviceVector ); bandwidth = datasetSize / timeDevice; cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl; cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl; // TODO: devise a way to check the result of the timed function // if( resultHost != resultDevice ) // { // cerr << "Error. " << resultHost << " != " << resultDevice << endl; //return EXIT_FAILURE; // } benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); /* Loading