Loading tests/benchmarks/benchmarks.h +70 −66 Original line number Diff line number Diff line #pragma once #include <iostream> #include <iomanip> #include <core/tnlTimerRT.h> Loading @@ -11,95 +12,98 @@ namespace benchmarks const double oneGB = 1024.0 * 1024.0 * 1024.0; // TODO: add data member for error message struct BenchmarkError {}; auto trueFunc = []() { return true; }; auto voidFunc = [](){}; template< typename ComputeFunction, typename CheckFunction, typename ResetFunction > double benchmarkSingle( const int & loops, timeFunction( ComputeFunction compute, ResetFunction reset, const int & loops, const double & datasetSize, // in GB ComputeFunction compute, CheckFunction check, ResetFunction reset ) const double & baseTime, // in seconds (baseline for speedup calculation) const char* performer ) { // the timer is constructed zero-initialized and stopped tnlTimerRT timer; timer.reset(); reset(); for(int i = 0; i < loops; ++i) { // TODO: not necessary for host computations // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA cudaDeviceSynchronize(); #endif timer.start(); compute(); #ifdef HAVE_CUDA cudaDeviceSynchronize(); #endif timer.stop(); if( ! check() ) throw BenchmarkError(); reset(); } const double time = timer.getTime(); const double bandwidth = datasetSize / time; std::cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << std::endl; using namespace std; cout << " " << performer << ": bandwidth: " << setw( 8 ) << bandwidth << " GB/sec, time: " << setw( 8 ) << time << " sec, speedup: "; if( baseTime ) cout << baseTime / time << endl; else cout << "N/A" << endl; return time; } template< typename ComputeHostFunction, typename ComputeCudaFunction, typename CheckFunction, typename ResetFunction > void benchmarkCuda( const int & loops, const double & datasetSize, // in GB ComputeHostFunction computeHost, ComputeCudaFunction computeCuda, CheckFunction check, ResetFunction reset ) // This specialization terminates the recursion template< typename ResetFunction, typename ComputeFunction > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute ) { // timers are constructed zero-initialized and stopped tnlTimerRT timerHost, timerCuda, timerCudaSync; for(int i = 0; i < loops; ++i) { timerHost.start(); computeHost(); timerHost.stop(); timerCuda.start(); computeCuda(); timerCuda.stop(); if( ! check() ) throw BenchmarkError(); reset(); // Compute again on CUDA, with explicit synchronization #ifdef HAVE_CUDA cudaDeviceSynchronize(); timerCudaSync.start(); computeCuda(); cudaDeviceSynchronize(); timerCudaSync.stop(); #endif timeFunction( compute, reset, loops, datasetSize, baseTime, performer ); } reset(); // Recursive template function to deal with benchmarks involving multiple computations template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute, NextComputations & ... nextComputations ) { benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); } const double timeHost = timerHost.getTime(); const double timeCuda = timerCuda.getTime(); const double timeCudaSync = timerCudaSync.getTime(); const double bandwidthHost = datasetSize / timeHost; const double bandwidthCuda = datasetSize / timeCuda; const double bandwidthCudaSync = datasetSize / timeCudaSync; std::cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl; std::cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl; std::cout << " GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl; std::cout << " CPU/GPU speedup: " << timeHost / timeCuda << std::endl; std::cout << " CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl; // Main function for benchmarking template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > void benchmarkOperation( const char* operation, const double & datasetSize, const int & loops, ResetFunction reset, const char* performer, ComputeFunction computeBase, NextComputations... nextComputations ) { cout << "Benchmarking " << operation << ":" << endl; double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); std::cout << std::endl; } Loading tests/benchmarks/tnl-cuda-benchmarks.h +10 −8 Original line number Diff line number Diff line Loading @@ -29,6 +29,8 @@ using namespace tnl::benchmarks; // TODO: should benchmarks check the result of the computation? // silly alias to match the number of template parameters with other formats template< typename Real, typename Device, typename Index > using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; Loading Loading @@ -147,19 +149,17 @@ benchmarkSpMV( const int & loops, tnlList< tnlString > parsedType; parseObjectType( HostMatrix::getType(), parsedType ); cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl; tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ] + ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")"; const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); // check and reset functions auto check = [&]() { return hostVector2 == deviceVector2; }; // reset function auto reset = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); hostVector2.setValue( 0.0 ); deviceVector2.setValue( 0.0 ); }; Loading @@ -172,7 +172,9 @@ benchmarkSpMV( const int & loops, deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset ); benchmarkOperation( operationDescription.getString(), 2 * datasetSize, loops, reset, "CPU", spmvHost, "GPU", spmvCuda ); return true; } Loading tests/benchmarks/vector-operations.h +44 −51 Original line number Diff line number Diff line Loading @@ -37,21 +37,10 @@ benchmarkVectorOperations( const int & loops, Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions // (Make sure to always use some in benchmarks, even if it's not necessary // to assure correct result - it helps to clear cache and avoid optimizations // of the benchmark loop.) auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); Loading @@ -68,148 +57,152 @@ benchmarkVectorOperations( const int & loops, reset12(); cout << "Benchmarking CPU-CPU memory transfer:" << endl; auto copyAssignHostHost = [&]() { hostVector = hostVector2; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssignHostHost, trueFunc, reset1 ); cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssignHostCuda = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssignHostCuda, compare1, reset1 ); cout << "Benchmarking GPU-GPU memory transfer:" << endl; auto copyAssignCudaCuda = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssignCudaCuda, trueFunc, reset1 ); cout << endl; benchmarkOperation( "copy assigment", datasetSize, loops, reset1, "CPU->CPU", copyAssignHostHost, "CPU->GPU", copyAssignHostCuda, "GPU->GPU", copyAssignCudaCuda ); cout << "Benchmarking tnlVector.operator==" << endl; auto compareHost = [&]() { resultHost = (int) hostVector == hostVector2; }; auto compareCuda = [&]() { resultDevice = (int) deviceVector == deviceVector2; }; benchmarkCuda( loops, 2 * datasetSize, compareHost, compareCuda, compareScalars, voidFunc ); benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1, "CPU", compareHost, "GPU", compareCuda ); cout << "Benchmarking scalar multiplication:" << endl; auto multiplyHost = [&]() { hostVector *= 0.5; }; auto multiplyCuda = [&]() { deviceVector *= 0.5; }; benchmarkCuda( loops, 2 * datasetSize, multiplyHost, multiplyCuda, compare1, reset1 ); benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1, "CPU", multiplyHost, "GPU", multiplyCuda ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1, "CPU", addVectorHost, "GPU", addVectorCuda ); cout << "Benchmarking max:" << endl; auto maxHost = [&]() { resultHost = hostVector.max(); }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmarkCuda( loops, datasetSize, maxHost, maxCuda, compareScalars, voidFunc ); benchmarkOperation( "max", datasetSize, loops, reset1, "CPU", maxHost, "GPU", maxCuda ); cout << "Benchmarking min:" << endl; auto minHost = [&]() { resultHost = hostVector.min(); }; auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmarkCuda( loops, datasetSize, minHost, minCuda, compareScalars, voidFunc ); benchmarkOperation( "min", datasetSize, loops, reset1, "CPU", minHost, "GPU", minCuda ); cout << "Benchmarking absMax:" << endl; auto absMaxHost = [&]() { resultHost = hostVector.absMax(); }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; benchmarkCuda( loops, datasetSize, absMaxHost, absMaxCuda, compareScalars, voidFunc ); benchmarkOperation( "absMax", datasetSize, loops, reset1, "CPU", absMaxHost, "GPU", absMaxCuda ); cout << "Benchmarking absMin:" << endl; auto absMinHost = [&]() { resultHost = hostVector.absMin(); }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; benchmarkCuda( loops, datasetSize, absMinHost, absMinCuda, compareScalars, voidFunc ); benchmarkOperation( "absMin", datasetSize, loops, reset1, "CPU", absMinHost, "GPU", absMinCuda ); cout << "Benchmarking sum:" << endl; auto sumHost = [&]() { resultHost = hostVector.sum(); }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmarkCuda( loops, datasetSize, sumHost, sumCuda, compareScalars, voidFunc ); benchmarkOperation( "sum", datasetSize, loops, reset1, "CPU", sumHost, "GPU", sumCuda ); cout << "Benchmarking l1 norm: " << endl; auto l1normHost = [&]() { resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; benchmarkCuda( loops, datasetSize, l1normHost, l1normCuda, compareScalars, voidFunc ); benchmarkOperation( "l1 norm", datasetSize, loops, reset1, "CPU", l1normHost, "GPU", l1normCuda ); cout << "Benchmarking l2 norm: " << endl; auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); benchmarkOperation( "l2 norm", datasetSize, loops, reset1, "CPU", l2normHost, "GPU", l2normCuda ); cout << "Benchmarking l3 norm: " << endl; auto l3normHost = [&]() { resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmarkCuda( loops, datasetSize, l3normHost, l3normCuda, compareScalars, voidFunc ); benchmarkOperation( "l3 norm", datasetSize, loops, reset1, "CPU", l3normHost, "GPU", l3normCuda ); cout << "Benchmarking scalar product:" << endl; auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda ); /* TODO #ifdef HAVE_CUBLAS Loading Loading
tests/benchmarks/benchmarks.h +70 −66 Original line number Diff line number Diff line #pragma once #include <iostream> #include <iomanip> #include <core/tnlTimerRT.h> Loading @@ -11,95 +12,98 @@ namespace benchmarks const double oneGB = 1024.0 * 1024.0 * 1024.0; // TODO: add data member for error message struct BenchmarkError {}; auto trueFunc = []() { return true; }; auto voidFunc = [](){}; template< typename ComputeFunction, typename CheckFunction, typename ResetFunction > double benchmarkSingle( const int & loops, timeFunction( ComputeFunction compute, ResetFunction reset, const int & loops, const double & datasetSize, // in GB ComputeFunction compute, CheckFunction check, ResetFunction reset ) const double & baseTime, // in seconds (baseline for speedup calculation) const char* performer ) { // the timer is constructed zero-initialized and stopped tnlTimerRT timer; timer.reset(); reset(); for(int i = 0; i < loops; ++i) { // TODO: not necessary for host computations // Explicit synchronization of the CUDA device #ifdef HAVE_CUDA cudaDeviceSynchronize(); #endif timer.start(); compute(); #ifdef HAVE_CUDA cudaDeviceSynchronize(); #endif timer.stop(); if( ! check() ) throw BenchmarkError(); reset(); } const double time = timer.getTime(); const double bandwidth = datasetSize / time; std::cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << std::endl; using namespace std; cout << " " << performer << ": bandwidth: " << setw( 8 ) << bandwidth << " GB/sec, time: " << setw( 8 ) << time << " sec, speedup: "; if( baseTime ) cout << baseTime / time << endl; else cout << "N/A" << endl; return time; } template< typename ComputeHostFunction, typename ComputeCudaFunction, typename CheckFunction, typename ResetFunction > void benchmarkCuda( const int & loops, const double & datasetSize, // in GB ComputeHostFunction computeHost, ComputeCudaFunction computeCuda, CheckFunction check, ResetFunction reset ) // This specialization terminates the recursion template< typename ResetFunction, typename ComputeFunction > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute ) { // timers are constructed zero-initialized and stopped tnlTimerRT timerHost, timerCuda, timerCudaSync; for(int i = 0; i < loops; ++i) { timerHost.start(); computeHost(); timerHost.stop(); timerCuda.start(); computeCuda(); timerCuda.stop(); if( ! check() ) throw BenchmarkError(); reset(); // Compute again on CUDA, with explicit synchronization #ifdef HAVE_CUDA cudaDeviceSynchronize(); timerCudaSync.start(); computeCuda(); cudaDeviceSynchronize(); timerCudaSync.stop(); #endif timeFunction( compute, reset, loops, datasetSize, baseTime, performer ); } reset(); // Recursive template function to deal with benchmarks involving multiple computations template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > inline void benchmarkNextOperation( const double & datasetSize, const int & loops, ResetFunction reset, const double & baseTime, const char* performer, ComputeFunction compute, NextComputations & ... nextComputations ) { benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); } const double timeHost = timerHost.getTime(); const double timeCuda = timerCuda.getTime(); const double timeCudaSync = timerCudaSync.getTime(); const double bandwidthHost = datasetSize / timeHost; const double bandwidthCuda = datasetSize / timeCuda; const double bandwidthCudaSync = datasetSize / timeCudaSync; std::cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl; std::cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl; std::cout << " GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl; std::cout << " CPU/GPU speedup: " << timeHost / timeCuda << std::endl; std::cout << " CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl; // Main function for benchmarking template< typename ResetFunction, typename ComputeFunction, typename... NextComputations > void benchmarkOperation( const char* operation, const double & datasetSize, const int & loops, ResetFunction reset, const char* performer, ComputeFunction computeBase, NextComputations... nextComputations ) { cout << "Benchmarking " << operation << ":" << endl; double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer ); benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... ); std::cout << std::endl; } Loading
tests/benchmarks/tnl-cuda-benchmarks.h +10 −8 Original line number Diff line number Diff line Loading @@ -29,6 +29,8 @@ using namespace tnl::benchmarks; // TODO: should benchmarks check the result of the computation? // silly alias to match the number of template parameters with other formats template< typename Real, typename Device, typename Index > using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; Loading Loading @@ -147,19 +149,17 @@ benchmarkSpMV( const int & loops, tnlList< tnlString > parsedType; parseObjectType( HostMatrix::getType(), parsedType ); cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl; tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ] + ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")"; const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow ); setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow ); const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB; hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); // check and reset functions auto check = [&]() { return hostVector2 == deviceVector2; }; // reset function auto reset = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); hostVector2.setValue( 0.0 ); deviceVector2.setValue( 0.0 ); }; Loading @@ -172,7 +172,9 @@ benchmarkSpMV( const int & loops, deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); }; benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset ); benchmarkOperation( operationDescription.getString(), 2 * datasetSize, loops, reset, "CPU", spmvHost, "GPU", spmvCuda ); return true; } Loading
tests/benchmarks/vector-operations.h +44 −51 Original line number Diff line number Diff line Loading @@ -37,21 +37,10 @@ benchmarkVectorOperations( const int & loops, Real resultHost, resultDevice; // check functions auto compare1 = [&]() { return hostVector == deviceVector; }; auto compare2 = [&]() { return hostVector2 == deviceVector2; }; auto compare12 = [&]() { return compare1() && compare2(); }; auto compareScalars = [&]() { return resultHost == resultDevice; }; // reset functions // (Make sure to always use some in benchmarks, even if it's not necessary // to assure correct result - it helps to clear cache and avoid optimizations // of the benchmark loop.) auto reset1 = [&]() { hostVector.setValue( 1.0 ); deviceVector.setValue( 1.0 ); Loading @@ -68,148 +57,152 @@ benchmarkVectorOperations( const int & loops, reset12(); cout << "Benchmarking CPU-CPU memory transfer:" << endl; auto copyAssignHostHost = [&]() { hostVector = hostVector2; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssignHostHost, trueFunc, reset1 ); cout << "Benchmarking CPU-GPU memory transfer:" << endl; auto copyAssignHostCuda = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssignHostCuda, compare1, reset1 ); cout << "Benchmarking GPU-GPU memory transfer:" << endl; auto copyAssignCudaCuda = [&]() { deviceVector = hostVector; }; cout << " "; benchmarkSingle( loops, datasetSize, copyAssignCudaCuda, trueFunc, reset1 ); cout << endl; benchmarkOperation( "copy assigment", datasetSize, loops, reset1, "CPU->CPU", copyAssignHostHost, "CPU->GPU", copyAssignHostCuda, "GPU->GPU", copyAssignCudaCuda ); cout << "Benchmarking tnlVector.operator==" << endl; auto compareHost = [&]() { resultHost = (int) hostVector == hostVector2; }; auto compareCuda = [&]() { resultDevice = (int) deviceVector == deviceVector2; }; benchmarkCuda( loops, 2 * datasetSize, compareHost, compareCuda, compareScalars, voidFunc ); benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1, "CPU", compareHost, "GPU", compareCuda ); cout << "Benchmarking scalar multiplication:" << endl; auto multiplyHost = [&]() { hostVector *= 0.5; }; auto multiplyCuda = [&]() { deviceVector *= 0.5; }; benchmarkCuda( loops, 2 * datasetSize, multiplyHost, multiplyCuda, compare1, reset1 ); benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1, "CPU", multiplyHost, "GPU", multiplyCuda ); cout << "Benchmarking vector addition:" << endl; auto addVectorHost = [&]() { hostVector.addVector( hostVector2 ); }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1, "CPU", addVectorHost, "GPU", addVectorCuda ); cout << "Benchmarking max:" << endl; auto maxHost = [&]() { resultHost = hostVector.max(); }; auto maxCuda = [&]() { resultDevice = deviceVector.max(); }; benchmarkCuda( loops, datasetSize, maxHost, maxCuda, compareScalars, voidFunc ); benchmarkOperation( "max", datasetSize, loops, reset1, "CPU", maxHost, "GPU", maxCuda ); cout << "Benchmarking min:" << endl; auto minHost = [&]() { resultHost = hostVector.min(); }; auto minCuda = [&]() { resultDevice = deviceVector.min(); }; benchmarkCuda( loops, datasetSize, minHost, minCuda, compareScalars, voidFunc ); benchmarkOperation( "min", datasetSize, loops, reset1, "CPU", minHost, "GPU", minCuda ); cout << "Benchmarking absMax:" << endl; auto absMaxHost = [&]() { resultHost = hostVector.absMax(); }; auto absMaxCuda = [&]() { resultDevice = deviceVector.absMax(); }; benchmarkCuda( loops, datasetSize, absMaxHost, absMaxCuda, compareScalars, voidFunc ); benchmarkOperation( "absMax", datasetSize, loops, reset1, "CPU", absMaxHost, "GPU", absMaxCuda ); cout << "Benchmarking absMin:" << endl; auto absMinHost = [&]() { resultHost = hostVector.absMin(); }; auto absMinCuda = [&]() { resultDevice = deviceVector.absMin(); }; benchmarkCuda( loops, datasetSize, absMinHost, absMinCuda, compareScalars, voidFunc ); benchmarkOperation( "absMin", datasetSize, loops, reset1, "CPU", absMinHost, "GPU", absMinCuda ); cout << "Benchmarking sum:" << endl; auto sumHost = [&]() { resultHost = hostVector.sum(); }; auto sumCuda = [&]() { resultDevice = deviceVector.sum(); }; benchmarkCuda( loops, datasetSize, sumHost, sumCuda, compareScalars, voidFunc ); benchmarkOperation( "sum", datasetSize, loops, reset1, "CPU", sumHost, "GPU", sumCuda ); cout << "Benchmarking l1 norm: " << endl; auto l1normHost = [&]() { resultHost = hostVector.lpNorm( 1.0 ); }; auto l1normCuda = [&]() { resultDevice = deviceVector.lpNorm( 1.0 ); }; benchmarkCuda( loops, datasetSize, l1normHost, l1normCuda, compareScalars, voidFunc ); benchmarkOperation( "l1 norm", datasetSize, loops, reset1, "CPU", l1normHost, "GPU", l1normCuda ); cout << "Benchmarking l2 norm: " << endl; auto l2normHost = [&]() { resultHost = hostVector.lpNorm( 2.0 ); }; auto l2normCuda = [&]() { resultDevice = deviceVector.lpNorm( 2.0 ); }; benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc ); benchmarkOperation( "l2 norm", datasetSize, loops, reset1, "CPU", l2normHost, "GPU", l2normCuda ); cout << "Benchmarking l3 norm: " << endl; auto l3normHost = [&]() { resultHost = hostVector.lpNorm( 3.0 ); }; auto l3normCuda = [&]() { resultDevice = deviceVector.lpNorm( 3.0 ); }; benchmarkCuda( loops, datasetSize, l3normHost, l3normCuda, compareScalars, voidFunc ); benchmarkOperation( "l3 norm", datasetSize, loops, reset1, "CPU", l3normHost, "GPU", l3normCuda ); cout << "Benchmarking scalar product:" << endl; auto scalarProductHost = [&]() { resultHost = hostVector.scalarProduct( hostVector2 ); }; auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc ); benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda ); /* TODO #ifdef HAVE_CUBLAS Loading