Benchmarks: add a case with explicit cudaThreadSynchronize() call (5b36e3c1) · Commits · TNL / tnl-dev

tests/benchmarks/benchmarks.h

+16 −1

Original line number	Diff line number	Diff line
		@@ -62,11 +62,13 @@ benchmarkCuda( const int & loops,
		CheckFunction check = trueFunc,
		ResetFunction reset = voidFunc )
		{
		tnlTimerRT timerHost, timerCuda;
		tnlTimerRT timerHost, timerCuda, timerCudaSync;
		timerHost.reset();
		timerHost.stop();
		timerCuda.reset();
		timerCuda.stop();
		timerCudaSync.reset();
		timerCudaSync.stop();

		for(int i = 0; i < loops; ++i) {
		timerHost.start();
		@@ -81,15 +83,28 @@ benchmarkCuda( const int & loops,
		throw BenchmarkError();

		reset();

		// Compute again on CUDA, with explicit synchronization of threads
		timerCudaSync.start();
		computeCuda();
		cudaThreadSynchronize();
		timerCudaSync.stop();

		reset();
		}

		const double timeHost = timerHost.getTime();
		const double timeCuda = timerCuda.getTime();
		const double timeCudaSync = timerCudaSync.getTime();
		const double bandwidthHost = datasetSize / timeHost;
		const double bandwidthCuda = datasetSize / timeCuda;
		const double bandwidthCudaSync = datasetSize / timeCudaSync;
		std::cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl;
		std::cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl;
		std::cout << " GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl;
		std::cout << " CPU/GPU speedup: " << timeHost / timeCuda << std::endl;
		std::cout << " CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl;
		std::cout << std::endl;
		}

		} // namespace benchmarks

+3 −3

Original line number	Diff line number	Diff line
		@@ -43,7 +43,9 @@ using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
		// - reset() clears the timer and starts it again
		// - getTime() stops the timer and starts it again !!!
		// - data members are not zero-initialized - reset has to be called manually, but it immediately starts the timer
		// FIXME: scalarProduct is not const method
		// FIXME:
		// - scalarProduct is not const method
		// - cudaThreadSynchronize() should be called from all CUDA methods


		template< typename Matrix >
		@@ -183,8 +185,6 @@ benchmarkSpMV( const int & loops,
		};
		auto spmvCuda = [&]() {
		deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		// TODO: tnlCSRMatrix does not synchronize
		cudaThreadSynchronize();
		};

		benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset );

+0 −2

Original line number	Diff line number	Diff line
		@@ -78,8 +78,6 @@ benchmarkVectorOperations( const int & loops,
		};
		auto addVectorCuda = [&]() {
		deviceVector.addVector( deviceVector2 );
		// TODO: synchronization should be part of addVector
		cudaThreadSynchronize();
		};
		benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );