Commit 5e5d94ce authored by Jakub Klinkovský

Refactoring benchmarks using more lambda functions

parent 8bb94c12
@@ -41,6 +41,8 @@ const double oneGB = 1024.0 * 1024.0 * 1024.0;
// check operations with the timer:
// - reset() clears the timer and starts it again
// - getTime() stops the timer and starts it again !!!
// - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer
// FIXME: scalarProduct is not const method
template< typename Matrix >
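For reference, the timer semantics described in the comments above imply a usage pattern along these lines (a minimal sketch based only on those comments; tnlTimerRT is the library's own timer class, and loops is an illustrative local variable):

    const int loops = 10;                    // illustrative
    tnlTimerRT timer;
    timer.reset();    // mandatory: members are not zero-initialized; reset() also starts the timer
    timer.stop();     // stop right away if the measured section does not begin immediately
    for( int i = 0; i < loops; i++ ) {
        timer.start();
        // ... measured work ...
        timer.stop();
    }
    const double seconds = timer.getTime();  // note: getTime() restarts the timer as well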
@@ -107,19 +109,86 @@ void setCudaTestMatrix( Matrix& matrix,
tnlCuda::freeFromDevice( kernel_matrix );
}
- template<typename Function, typename... Args>
- double time_void_function(int loops, Function & f, Args & ...args)
// TODO: add data member for error message
struct BenchmarkError {};
auto trueFunc = []() { return true; };
auto voidFunc = [](){};
template< typename ComputeFunction,
typename CheckFunction,
typename ResetFunction >
double
benchmarkSingle( const int & loops,
const double & datasetSize, // in GB
ComputeFunction & compute,
// TODO: check that default argument works here
CheckFunction & check = trueFunc,
ResetFunction & reset = voidFunc )
{
tnlTimerRT timer;
timer.reset();
for(int i = 0; i < loops; ++i) {
timer.start();
- f(args...);
compute();
timer.stop();
if( ! check() )
throw BenchmarkError();
reset();
}
const double time = timer.getTime();
const double bandwidth = datasetSize / time;
cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << endl;
return time;
}
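A call to benchmarkSingle then follows the same shape as the copyAssign benchmark later in this commit; a minimal sketch with illustrative lambdas:

    auto compute = [&]() { deviceVector = hostVector; };
    auto check   = [&]() { return hostVector == deviceVector; };
    auto reset   = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); };
    benchmarkSingle( loops, datasetSize, compute, check, reset );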
template< typename ComputeHostFunction,
typename ComputeCudaFunction,
typename CheckFunction,
typename ResetFunction >
void
benchmarkCuda( const int & loops,
const double & datasetSize, // in GB
ComputeHostFunction & computeHost,
ComputeCudaFunction & computeCuda,
// TODO: check that default argument works here
CheckFunction & check = trueFunc,
ResetFunction & reset = voidFunc )
{
tnlTimerRT timerHost, timerCuda;
timerHost.reset();
timerHost.stop();
timerCuda.reset();
timerCuda.stop();
for(int i = 0; i < loops; ++i) {
timerHost.start();
computeHost();
timerHost.stop();
timerCuda.start();
computeCuda();
timerCuda.stop();
if( ! check() )
throw BenchmarkError();
reset();
}
- return timer.getTime();
const double timeHost = timerHost.getTime();
const double timeCuda = timerCuda.getTime();
const double bandwidthHost = datasetSize / timeHost;
const double bandwidthCuda = datasetSize / timeCuda;
cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << endl;
cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << endl;
cout << " CPU/GPU speedup: " << timeHost / timeCuda << endl;
}
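Regarding the TODO above: a default argument cannot be used to deduce a template parameter, so calling benchmarkSingle or benchmarkCuda without explicit check/reset arguments will not compile as written. One possible fix (a sketch, not part of this commit) is to give the trailing template parameters default types taken from the default lambdas:

    template< typename ComputeFunction,
              typename CheckFunction = decltype(trueFunc),
              typename ResetFunction = decltype(voidFunc) >
    double
    benchmarkSingle( const int & loops,
                     const double & datasetSize, // in GB
                     ComputeFunction & compute,
                     CheckFunction & check = trueFunc,
                     ResetFunction & reset = voidFunc );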
template< typename Real,
@@ -169,43 +238,33 @@ benchmarkSpMV( const int & loops,
return false;
}
- double bandwidth( 0.0 ), datasetSize( 0.0 ), timeHost( 0.0 ), timeDevice( 0.0 );
tnlList< tnlString > parsedType;
parseObjectType( HostMatrix::getType(), parsedType );
cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl;
const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
- datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
hostVector.setValue( 1.0 );
deviceVector.setValue( 1.0 );
- auto spmvHost = []( const HostMatrix & m, const HostVector & x, HostVector & y ) {
- m.vectorProduct( x, y );
// check and reset functions
auto check = [&]() {
return hostVector2 == deviceVector2;
};
- timeHost = time_void_function( loops, spmvHost, hostMatrix, hostVector, hostVector2 );
- bandwidth = datasetSize / timeHost;
- cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
- auto spmvCuda = []( const DeviceMatrix & m, const CudaVector & x, CudaVector & y ) {
- m.vectorProduct( x, y );
auto reset = [&]() {
hostVector2.setValue( 0.0 );
deviceVector2.setValue( 0.0 );
};
- timeDevice = time_void_function( loops, spmvCuda, deviceMatrix, deviceVector, deviceVector2 );
- bandwidth = datasetSize / timeDevice;
- cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
- cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
- //cout << hostVector2 << endl << deviceVector2 << endl;
- if( hostVector2 != deviceVector2 )
- {
- cerr << "Error in Spmv kernel" << endl;
- //for( int i = 0; i < size; i++ )
- // if( hostVector2.getElement( i ) != deviceVector2.getElement( i ) )
- // cerr << " " << i;
- }
// compute functions
auto spmvHost = [&]() {
hostMatrix.vectorProduct( hostVector, hostVector2 );
};
auto spmvCuda = [&]() {
deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
};
benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset );
return true;
}
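Two notes on the function above. The dataset size counts 2 * sizeof( Real ) + sizeof( int ) bytes per stored matrix element, i.e. the matrix value, the corresponding input-vector value, and the column index. Also, a failed check() now surfaces as a thrown BenchmarkError rather than a printed message, so a caller presumably needs a handler; a possible sketch (the actual call site is not shown in this excerpt, so the call below is illustrative):

    try {
        benchmarkSpMV< Real >( loops, size, elementsPerRow );   // hypothetical call; remaining template arguments elided
    }
    catch( const BenchmarkError & ) {
        cerr << "SpMV check failed: host and device results differ." << endl;
        return EXIT_FAILURE;
    }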
@@ -234,7 +293,6 @@ int main( int argc, char* argv[] )
elementsPerRow = atoi( argv[ 3 ] );
double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
HostVector hostVector, hostVector2;
@@ -246,76 +304,70 @@ int main( int argc, char* argv[] )
if( ! deviceVector2.setLike( deviceVector ) )
return EXIT_FAILURE;
hostVector.setValue( 1.0 );
deviceVector.setValue( 1.0 );
hostVector2.setValue( 1.0 );
deviceVector2.setValue( 1.0 );
Real resultHost, resultDevice;
// check functions
auto compare1 = [&]() {
return hostVector == deviceVector;
};
auto compare2 = [&]() {
return hostVector2 == deviceVector2;
};
auto compare12 = [&]() {
return compare1() && compare2();
};
auto compareScalars = [&]() {
return resultHost == resultDevice;
};
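compareScalars tests the two reductions for exact equality; since the GPU generally sums in a different order than the CPU, a tolerance-based comparison may be more robust in practice (a sketch, not part of this commit; the tolerance is illustrative and requires <cmath>):

    auto compareScalarsApprox = [&]() {
        return std::fabs( resultHost - resultDevice ) <= 1e-6 * std::fabs( resultHost );
    };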
// reset functions
auto reset1 = [&]() {
hostVector.setValue( 1.0 );
deviceVector.setValue( 1.0 );
};
auto reset2 = [&]() {
hostVector2.setValue( 1.0 );
deviceVector2.setValue( 1.0 );
};
auto reset12 = [&]() {
reset1();
reset2();
};
- double bandwidth( 0.0 );
- Real resultHost, resultDevice, timeHost, timeDevice;
reset12();
cout << "Benchmarking CPU-GPU memory bandwidth: ";
auto copyAssign = []( CudaVector & v1, const HostVector & v2 ) {
v1 = v2;
cout << "Benchmarking CPU-GPU memory transfer:" << endl;
auto copyAssign = [&]() {
deviceVector = hostVector;
};
- timeHost = time_void_function( loops, copyAssign, deviceVector, hostVector );
- bandwidth = datasetSize / timeHost;
- cout << bandwidth << " GB/sec." << endl;
cout << " ";
benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );
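The copyAssign benchmark times the host-to-device direction only; assuming assignment is also defined in the device-to-host direction (an assumption, not shown in this excerpt), the reverse transfer could be measured the same way:

    auto copyAssignBack = [&]() {
        hostVector = deviceVector;   // device-to-host transfer (sketch)
    };
    cout << "  ";
    benchmarkSingle( loops, datasetSize, copyAssignBack, compare1, reset1 );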
cout << "Benchmarking vector addition:" << endl;
- auto addVectorHost = []( HostVector & v1, const HostVector & v2 ) {
- v1.addVector( v2 );
auto addVectorHost = [&]() {
hostVector.addVector( hostVector2 );
};
- timeHost = time_void_function( loops, addVectorHost, hostVector, hostVector2 );
- bandwidth = 3 * datasetSize / timeHost;
- cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
- auto addVectorCuda = []( CudaVector & v1, const CudaVector & v2 ) {
- v1.addVector( v2 );
auto addVectorCuda = [&]() {
deviceVector.addVector( deviceVector2 );
// TODO: synchronization should be part of addVector
cudaThreadSynchronize();
};
- timeDevice = time_void_function( loops, addVectorCuda, deviceVector, deviceVector2 );
- bandwidth = 3 * datasetSize / timeDevice;
- cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
- cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
- hostVector.setValue( 1.0 );
- deviceVector.setValue( 1.0 );
- hostVector2.setValue( 1.0 );
- deviceVector2.setValue( 1.0 );
benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
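The multipliers on datasetSize reflect how many vector-sized memory streams each operation touches: addVector reads both operands and writes the result (3), scalarProduct reads two vectors (2), and the copy and lpNorm benchmarks move one vector's worth of data (1). With illustrative numbers, size = 10^7 doubles and loops = 10 give datasetSize = 10 * 10^7 * 8 / 2^30 ≈ 0.75 GB, so the vector-addition bandwidth is reported against roughly 2.2 GB of traffic.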
cout << "Benchmarking scalar product:" << endl;
- // FIXME: scalarProduct is not const method
- // auto scalarProductHost = []( const HostVector & v1, const HostVector & v2 ) {
- auto scalarProductHost = []( HostVector & v1, const HostVector & v2 ) {
- return v1.scalarProduct( v2 );
auto scalarProductHost = [&]() {
resultHost = hostVector.scalarProduct( hostVector2 );
};
- timeHost = time_void_function( loops, scalarProductHost, hostVector, hostVector2 );
- bandwidth = 2 * datasetSize / timeHost;
- cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
- // FIXME: scalarProduct is not const method
- // auto scalarProductCuda = []( const CudaVector & v1, const CudaVector & v2 ) {
- auto scalarProductCuda = []( CudaVector & v1, const CudaVector & v2 ) {
- return v1.scalarProduct( v2 );
auto scalarProductCuda = [&]() {
resultDevice = deviceVector.scalarProduct( deviceVector2 );
};
- timeDevice = time_void_function( loops, scalarProductCuda, deviceVector, deviceVector2 );
- bandwidth = 2 * datasetSize / timeDevice;
- cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
- cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
- // TODO: devise a way to check the result of the timed function
- // if( resultHost != resultDevice )
- // {
- // cerr << "Error. " << resultHost << " != " << resultDevice << endl;
- //return EXIT_FAILURE;
- // }
benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
/* TODO
#ifdef HAVE_CUBLAS
cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
cublasHandle_t handle;
@@ -333,29 +385,16 @@ int main( int argc, char* argv[] )
bandwidth = 2 * datasetSize / timer.getTime();
cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
#endif
*/
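If the cuBLAS comparison above is revived, it could reuse the same lambda pattern; a sketch assuming the cuBLAS v2 API, Real = double, and that getData() exposes the raw device pointer (these are assumptions, not confirmed by this excerpt):

    #ifdef HAVE_CUBLAS
    cublasHandle_t cublasHandle;
    cublasCreate( &cublasHandle );
    auto scalarProductCublas = [&]() {
        // with the default (host) pointer mode the result lands directly in resultDevice
        cublasDdot( cublasHandle, size, deviceVector.getData(), 1,
                                        deviceVector2.getData(), 1, &resultDevice );
    };
    benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCublas, compareScalars, voidFunc );
    cublasDestroy( cublasHandle );
    #endif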
cout << "Benchmarking L2 norm: " << endl;
- auto l2normHost = []( const HostVector & v ) {
- return v.lpNorm( 2.0 );
auto l2normHost = [&]() {
resultHost = hostVector.lpNorm( 2.0 );
};
- timeHost = time_void_function( loops, l2normHost, hostVector );
- bandwidth = datasetSize / timeHost;
- cout << " CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeHost << " sec." << endl;
- auto l2normCuda = []( const CudaVector & v ) {
- return v.lpNorm( 2.0 );
auto l2normCuda = [&]() {
resultDevice = deviceVector.lpNorm( 2.0 );
};
- timeDevice = time_void_function( loops, l2normCuda, deviceVector );
- bandwidth = datasetSize / timeDevice;
- cout << " GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timeDevice << " sec." << endl;
- cout << " CPU/GPU speedup: " << timeHost / timeDevice << endl;
- // TODO: devise a way to check the result of the timed function
- // if( resultHost != resultDevice )
- // {
- // cerr << "Error. " << resultHost << " != " << resultDevice << endl;
- //return EXIT_FAILURE;
- // }
benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
/*
......