Loading tests/benchmarks/benchmarks.h +16 −1 Original line number Diff line number Diff line Loading @@ -62,11 +62,13 @@ benchmarkCuda( const int & loops, CheckFunction check = trueFunc, ResetFunction reset = voidFunc ) { tnlTimerRT timerHost, timerCuda; tnlTimerRT timerHost, timerCuda, timerCudaSync; timerHost.reset(); timerHost.stop(); timerCuda.reset(); timerCuda.stop(); timerCudaSync.reset(); timerCudaSync.stop(); for(int i = 0; i < loops; ++i) { timerHost.start(); Loading @@ -81,15 +83,28 @@ benchmarkCuda( const int & loops, throw BenchmarkError(); reset(); // Compute again on CUDA, with explicit synchronization of threads timerCudaSync.start(); computeCuda(); cudaThreadSynchronize(); timerCudaSync.stop(); reset(); } const double timeHost = timerHost.getTime(); const double timeCuda = timerCuda.getTime(); const double timeCudaSync = timerCudaSync.getTime(); const double bandwidthHost = datasetSize / timeHost; const double bandwidthCuda = datasetSize / timeCuda; const double bandwidthCudaSync = datasetSize / timeCudaSync; std::cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl; std::cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl; std::cout << " GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl; std::cout << " CPU/GPU speedup: " << timeHost / timeCuda << std::endl; std::cout << " CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl; std::cout << std::endl; } } // namespace benchmarks Loading tests/benchmarks/tnl-cuda-benchmarks.h +3 −3 Original line number Diff line number Diff line Loading @@ -43,7 +43,9 @@ using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; // - reset() clears the timer and starts it again // - getTime() stops the timer and starts it again !!! // - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer // FIXME: scalarProduct is not const method // FIXME: // - scalarProduct is not const method // - cudaThreadSynchronize() should be called from all CUDA methods template< typename Matrix > Loading Loading @@ -183,8 +185,6 @@ benchmarkSpMV( const int & loops, }; auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); // TODO: tnlCSRMatrix does not synchronize cudaThreadSynchronize(); }; benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset ); Loading tests/benchmarks/vector-operations.h +0 −2 Original line number Diff line number Diff line Loading @@ -78,8 +78,6 @@ benchmarkVectorOperations( const int & loops, }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); Loading Loading
tests/benchmarks/benchmarks.h +16 −1 Original line number Diff line number Diff line Loading @@ -62,11 +62,13 @@ benchmarkCuda( const int & loops, CheckFunction check = trueFunc, ResetFunction reset = voidFunc ) { tnlTimerRT timerHost, timerCuda; tnlTimerRT timerHost, timerCuda, timerCudaSync; timerHost.reset(); timerHost.stop(); timerCuda.reset(); timerCuda.stop(); timerCudaSync.reset(); timerCudaSync.stop(); for(int i = 0; i < loops; ++i) { timerHost.start(); Loading @@ -81,15 +83,28 @@ benchmarkCuda( const int & loops, throw BenchmarkError(); reset(); // Compute again on CUDA, with explicit synchronization of threads timerCudaSync.start(); computeCuda(); cudaThreadSynchronize(); timerCudaSync.stop(); reset(); } const double timeHost = timerHost.getTime(); const double timeCuda = timerCuda.getTime(); const double timeCudaSync = timerCudaSync.getTime(); const double bandwidthHost = datasetSize / timeHost; const double bandwidthCuda = datasetSize / timeCuda; const double bandwidthCudaSync = datasetSize / timeCudaSync; std::cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl; std::cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl; std::cout << " GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl; std::cout << " CPU/GPU speedup: " << timeHost / timeCuda << std::endl; std::cout << " CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl; std::cout << std::endl; } } // namespace benchmarks Loading
tests/benchmarks/tnl-cuda-benchmarks.h +3 −3 Original line number Diff line number Diff line Loading @@ -43,7 +43,9 @@ using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >; // - reset() clears the timer and starts it again // - getTime() stops the timer and starts it again !!! // - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer // FIXME: scalarProduct is not const method // FIXME: // - scalarProduct is not const method // - cudaThreadSynchronize() should be called from all CUDA methods template< typename Matrix > Loading Loading @@ -183,8 +185,6 @@ benchmarkSpMV( const int & loops, }; auto spmvCuda = [&]() { deviceMatrix.vectorProduct( deviceVector, deviceVector2 ); // TODO: tnlCSRMatrix does not synchronize cudaThreadSynchronize(); }; benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset ); Loading
tests/benchmarks/vector-operations.h +0 −2 Original line number Diff line number Diff line Loading @@ -78,8 +78,6 @@ benchmarkVectorOperations( const int & loops, }; auto addVectorCuda = [&]() { deviceVector.addVector( deviceVector2 ); // TODO: synchronization should be part of addVector cudaThreadSynchronize(); }; benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 ); Loading