Benchmarks: refactoring, generalization for other operations (7d4d00e7) · Commits · TNL / tnl-dev

tests/benchmarks/benchmarks.h

+70 −66

Original line number	Diff line number	Diff line
		#pragma once

		#include <iomanip>
		#include <iostream>
		#include <stdexcept>
		#include <string>
		#include <utility>

		#include <core/tnlTimerRT.h>

		@@ -11,95 +12,98 @@ namespace benchmarks

		// Number of bytes in one GB as used by the benchmarks (1024^3 = 2^30);
		// byte counts are divided by this constant to express dataset sizes in GB.
		const double oneGB = 1024.0 * 1024.0 * 1024.0;

		// Exception thrown by the benchmark drivers when the check function of a
		// benchmarked computation reports a wrong result.
		//
		// Derives from std::runtime_error so that it carries an error message and
		// can be caught as std::exception. It stays default-constructible, so
		// existing `throw BenchmarkError();` sites keep compiling unchanged.
		struct BenchmarkError : public std::runtime_error
		{
		   // Uses a generic message when the thrower gives no details.
		   BenchmarkError()
		   : std::runtime_error( "benchmark check failed" ) {}

		   // message: human-readable description of what went wrong.
		   explicit BenchmarkError( const std::string & message )
		   : std::runtime_error( message ) {}
		};

		// Trivial callables for benchmarks that need no real check (trueFunc always
		// reports success) or no real reset step (voidFunc does nothing).
		// They are declared const on purpose: a const namespace-scope variable has
		// internal linkage, so every translation unit including this header gets
		// its own copy instead of a multiply-defined global.
		const auto trueFunc = []() { return true; };
		const auto voidFunc = [](){};

		template< typename ComputeFunction,
		typename CheckFunction,
		typename ResetFunction >
		double
		benchmarkSingle( const int & loops,
		timeFunction( ComputeFunction compute,
		ResetFunction reset,
		const int & loops,
		const double & datasetSize, // in GB
		ComputeFunction compute,
		CheckFunction check,
		ResetFunction reset )
		const double & baseTime, // in seconds (baseline for speedup calculation)
		const char* performer )
		{
		// the timer is constructed zero-initialized and stopped
		tnlTimerRT timer;
		timer.reset();

		reset();
		for(int i = 0; i < loops; ++i) {
		// TODO: not necessary for host computations
		// Explicit synchronization of the CUDA device
		#ifdef HAVE_CUDA
		cudaDeviceSynchronize();
		#endif
		timer.start();
		compute();
		#ifdef HAVE_CUDA
		cudaDeviceSynchronize();
		#endif
		timer.stop();

		if( ! check() )
		throw BenchmarkError();

		reset();
		}

		const double time = timer.getTime();
		const double bandwidth = datasetSize / time;
		std::cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << std::endl;

		using namespace std;
		cout << " " << performer << ": bandwidth: "
		<< setw( 8 ) << bandwidth << " GB/sec, time: "
		<< setw( 8 ) << time << " sec, speedup: ";
		if( baseTime )
		cout << baseTime / time << endl;
		else
		cout << "N/A" << endl;

		return time;
		}

		template< typename ComputeHostFunction,
		typename ComputeCudaFunction,
		typename CheckFunction,
		typename ResetFunction >
		void
		benchmarkCuda( const int & loops,
		const double & datasetSize, // in GB
		ComputeHostFunction computeHost,
		ComputeCudaFunction computeCuda,
		CheckFunction check,
		ResetFunction reset )
		// This specialization terminates the recursion
		template< typename ResetFunction,
		typename ComputeFunction >
		inline void
		benchmarkNextOperation( const double & datasetSize,
		const int & loops,
		ResetFunction reset,
		const double & baseTime,
		const char* performer,
		ComputeFunction compute )
		{
		// timers are constructed zero-initialized and stopped
		tnlTimerRT timerHost, timerCuda, timerCudaSync;

		for(int i = 0; i < loops; ++i) {
		timerHost.start();
		computeHost();
		timerHost.stop();

		timerCuda.start();
		computeCuda();
		timerCuda.stop();

		if( ! check() )
		throw BenchmarkError();

		reset();

		// Compute again on CUDA, with explicit synchronization
		#ifdef HAVE_CUDA
		cudaDeviceSynchronize();
		timerCudaSync.start();
		computeCuda();
		cudaDeviceSynchronize();
		timerCudaSync.stop();
		#endif
		timeFunction( compute, reset, loops, datasetSize, baseTime, performer );
		}

		reset();
		// Recursive variadic overload for benchmarks involving multiple computations.
		//
		// The trailing arguments are consumed as (performer, compute) pairs: the
		// first pair is benchmarked through the six-argument overload, the remaining
		// pairs are handled by recursion with the same datasetSize, loops, reset and
		// baseTime.
		//
		// The remaining arguments are taken by forwarding reference, so both lvalues
		// (as passed by benchmarkOperation) and temporaries (e.g. lambdas written
		// directly in the call) are accepted.
		template< typename ResetFunction,
		          typename ComputeFunction,
		          typename... NextComputations >
		inline void
		benchmarkNextOperation( const double & datasetSize,
		                        const int & loops,
		                        ResetFunction reset,
		                        const double & baseTime,
		                        const char* performer,
		                        ComputeFunction compute,
		                        NextComputations && ... nextComputations )
		{
		   // Benchmark the current (performer, compute) pair ...
		   benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute );
		   // ... then recurse on whatever pairs are left.
		   benchmarkNextOperation( datasetSize, loops, reset, baseTime,
		                           std::forward< NextComputations >( nextComputations )... );
		}

		const double timeHost = timerHost.getTime();
		const double timeCuda = timerCuda.getTime();
		const double timeCudaSync = timerCudaSync.getTime();
		const double bandwidthHost = datasetSize / timeHost;
		const double bandwidthCuda = datasetSize / timeCuda;
		const double bandwidthCudaSync = datasetSize / timeCudaSync;
		std::cout << " CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl;
		std::cout << " GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl;
		std::cout << " GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl;
		std::cout << " CPU/GPU speedup: " << timeHost / timeCuda << std::endl;
		std::cout << " CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl;
		// Main function for benchmarking one operation.
		//
		// Prints a "Benchmarking <operation>:" header, times computeBase under the
		// label `performer` (0.0 is passed as its own baseline), and then benchmarks
		// every following (performer, compute) pair using the time of computeBase as
		// the baseline. An empty line is printed at the end.
		//
		// operation        - description printed in the header line
		// datasetSize      - size of the processed data, forwarded to the timing code
		// loops            - number of repetitions, forwarded to the timing code
		// reset            - callable forwarded to the timing code for every computation
		// performer        - label of the baseline computation
		// computeBase      - baseline computation
		// nextComputations - further (const char* performer, compute) pairs
		template< typename ResetFunction,
		          typename ComputeFunction,
		          typename... NextComputations >
		void
		benchmarkOperation( const char* operation,
		                    const double & datasetSize,
		                    const int & loops,
		                    ResetFunction reset,
		                    const char* performer,
		                    ComputeFunction computeBase,
		                    NextComputations... nextComputations )
		{
		   // Fully qualified so this header does not depend on a using-directive
		   // supplied by whoever includes it.
		   std::cout << "Benchmarking " << operation << ":" << std::endl;
		   const double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer );
		   benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... );
		   std::cout << std::endl;
		}

tests/benchmarks/tnl-cuda-benchmarks.h

+10 −8

Original line number	Diff line number	Diff line
		@@ -29,6 +29,8 @@
		using namespace tnl::benchmarks;


		// TODO: should benchmarks check the result of the computation?

		// Alias exposing tnlSlicedEllpackMatrix as a template with exactly three
		// parameters (Real, Device, Index), so that it matches the number of template
		// parameters of the other matrix formats.
		// NOTE(review): presumably tnlSlicedEllpackMatrix has additional defaulted
		// template parameters - confirm against its declaration.
		template< typename Real, typename Device, typename Index >
		using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
		@@ -147,19 +149,17 @@ benchmarkSpMV( const int & loops,

		tnlList< tnlString > parsedType;
		parseObjectType( HostMatrix::getType(), parsedType );
		cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl;
		tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ]
		+ ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")";

		const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
		setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
		const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
		hostVector.setValue( 1.0 );
		deviceVector.setValue( 1.0 );

		// check and reset functions
		auto check = [&]() {
		return hostVector2 == deviceVector2;
		};
		// reset function
		auto reset = [&]() {
		hostVector.setValue( 1.0 );
		deviceVector.setValue( 1.0 );
		hostVector2.setValue( 0.0 );
		deviceVector2.setValue( 0.0 );
		};
		@@ -172,7 +172,9 @@ benchmarkSpMV( const int & loops,
		deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
		};

		benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset );
		benchmarkOperation( operationDescription.getString(), 2 * datasetSize, loops, reset,
		"CPU", spmvHost,
		"GPU", spmvCuda );

		return true;
		}

tests/benchmarks/vector-operations.h

+44 −51

Original line number	Diff line number	Diff line
		@@ -37,21 +37,10 @@ benchmarkVectorOperations( const int & loops,
		Real resultHost, resultDevice;


		// check functions
		auto compare1 = [&]() {
		return hostVector == deviceVector;
		};
		auto compare2 = [&]() {
		return hostVector2 == deviceVector2;
		};
		auto compare12 = [&]() {
		return compare1() && compare2();
		};
		auto compareScalars = [&]() {
		return resultHost == resultDevice;
		};

		// reset functions
		// (Make sure to always pass one of them to the benchmarks, even when it is
		// not necessary to ensure a correct result - it helps to clear the cache and
		// to avoid optimizations of the benchmark loop.)
		auto reset1 = [&]() {
		hostVector.setValue( 1.0 );
		deviceVector.setValue( 1.0 );
		@@ -68,148 +57,152 @@ benchmarkVectorOperations( const int & loops,

		reset12();

		cout << "Benchmarking CPU-CPU memory transfer:" << endl;

		auto copyAssignHostHost = [&]() {
		hostVector = hostVector2;
		};
		cout << " ";
		benchmarkSingle( loops, datasetSize, copyAssignHostHost, trueFunc, reset1 );

		cout << "Benchmarking CPU-GPU memory transfer:" << endl;
		auto copyAssignHostCuda = [&]() {
		deviceVector = hostVector;
		};
		cout << " ";
		benchmarkSingle( loops, datasetSize, copyAssignHostCuda, compare1, reset1 );

		cout << "Benchmarking GPU-GPU memory transfer:" << endl;
		auto copyAssignCudaCuda = [&]() {
		deviceVector = hostVector;
		};
		cout << " ";
		benchmarkSingle( loops, datasetSize, copyAssignCudaCuda, trueFunc, reset1 );

		cout << endl;
		benchmarkOperation( "copy assigment", datasetSize, loops, reset1,
		"CPU->CPU", copyAssignHostHost,
		"CPU->GPU", copyAssignHostCuda,
		"GPU->GPU", copyAssignCudaCuda );


		cout << "Benchmarking tnlVector.operator==" << endl;
		// Vector comparison on the host and on the device.
		// The comparison is parenthesized before the conversion: a cast binds
		// tighter than operator==, so `(int) a == b` would convert the vector `a`
		// instead of the result of `a == b`.
		auto compareHost = [&]() {
		   resultHost = static_cast< int >( hostVector == hostVector2 );
		};
		auto compareCuda = [&]() {
		   resultDevice = static_cast< int >( deviceVector == deviceVector2 );
		};
		benchmarkCuda( loops, 2 * datasetSize, compareHost, compareCuda, compareScalars, voidFunc );
		benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1,
		"CPU", compareHost,
		"GPU", compareCuda );


		cout << "Benchmarking scalar multiplication:" << endl;
		auto multiplyHost = [&]() {
		hostVector *= 0.5;
		};
		auto multiplyCuda = [&]() {
		deviceVector *= 0.5;
		};
		benchmarkCuda( loops, 2 * datasetSize, multiplyHost, multiplyCuda, compare1, reset1 );
		benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1,
		"CPU", multiplyHost,
		"GPU", multiplyCuda );


		cout << "Benchmarking vector addition:" << endl;
		auto addVectorHost = [&]() {
		hostVector.addVector( hostVector2 );
		};
		auto addVectorCuda = [&]() {
		deviceVector.addVector( deviceVector2 );
		};
		benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
		benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1,
		"CPU", addVectorHost,
		"GPU", addVectorCuda );


		cout << "Benchmarking max:" << endl;
		auto maxHost = [&]() {
		resultHost = hostVector.max();
		};
		auto maxCuda = [&]() {
		resultDevice = deviceVector.max();
		};
		benchmarkCuda( loops, datasetSize, maxHost, maxCuda, compareScalars, voidFunc );
		benchmarkOperation( "max", datasetSize, loops, reset1,
		"CPU", maxHost,
		"GPU", maxCuda );


		cout << "Benchmarking min:" << endl;
		auto minHost = [&]() {
		resultHost = hostVector.min();
		};
		auto minCuda = [&]() {
		resultDevice = deviceVector.min();
		};
		benchmarkCuda( loops, datasetSize, minHost, minCuda, compareScalars, voidFunc );
		benchmarkOperation( "min", datasetSize, loops, reset1,
		"CPU", minHost,
		"GPU", minCuda );


		cout << "Benchmarking absMax:" << endl;
		auto absMaxHost = [&]() {
		resultHost = hostVector.absMax();
		};
		auto absMaxCuda = [&]() {
		resultDevice = deviceVector.absMax();
		};
		benchmarkCuda( loops, datasetSize, absMaxHost, absMaxCuda, compareScalars, voidFunc );
		benchmarkOperation( "absMax", datasetSize, loops, reset1,
		"CPU", absMaxHost,
		"GPU", absMaxCuda );


		cout << "Benchmarking absMin:" << endl;
		auto absMinHost = [&]() {
		resultHost = hostVector.absMin();
		};
		auto absMinCuda = [&]() {
		resultDevice = deviceVector.absMin();
		};
		benchmarkCuda( loops, datasetSize, absMinHost, absMinCuda, compareScalars, voidFunc );
		benchmarkOperation( "absMin", datasetSize, loops, reset1,
		"CPU", absMinHost,
		"GPU", absMinCuda );


		cout << "Benchmarking sum:" << endl;
		auto sumHost = [&]() {
		resultHost = hostVector.sum();
		};
		auto sumCuda = [&]() {
		resultDevice = deviceVector.sum();
		};
		benchmarkCuda( loops, datasetSize, sumHost, sumCuda, compareScalars, voidFunc );
		benchmarkOperation( "sum", datasetSize, loops, reset1,
		"CPU", sumHost,
		"GPU", sumCuda );


		cout << "Benchmarking l1 norm: " << endl;
		auto l1normHost = [&]() {
		resultHost = hostVector.lpNorm( 1.0 );
		};
		auto l1normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 1.0 );
		};
		benchmarkCuda( loops, datasetSize, l1normHost, l1normCuda, compareScalars, voidFunc );
		benchmarkOperation( "l1 norm", datasetSize, loops, reset1,
		"CPU", l1normHost,
		"GPU", l1normCuda );


		cout << "Benchmarking l2 norm: " << endl;
		auto l2normHost = [&]() {
		resultHost = hostVector.lpNorm( 2.0 );
		};
		auto l2normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 2.0 );
		};
		benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
		benchmarkOperation( "l2 norm", datasetSize, loops, reset1,
		"CPU", l2normHost,
		"GPU", l2normCuda );


		cout << "Benchmarking l3 norm: " << endl;
		auto l3normHost = [&]() {
		resultHost = hostVector.lpNorm( 3.0 );
		};
		auto l3normCuda = [&]() {
		resultDevice = deviceVector.lpNorm( 3.0 );
		};
		benchmarkCuda( loops, datasetSize, l3normHost, l3normCuda, compareScalars, voidFunc );
		benchmarkOperation( "l3 norm", datasetSize, loops, reset1,
		"CPU", l3normHost,
		"GPU", l3normCuda );


		cout << "Benchmarking scalar product:" << endl;
		auto scalarProductHost = [&]() {
		resultHost = hostVector.scalarProduct( hostVector2 );
		};
		auto scalarProductCuda = [&]() {
		resultDevice = deviceVector.scalarProduct( deviceVector2 );
		};
		benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
		benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
		"CPU", scalarProductHost,
		"GPU", scalarProductCuda );

		/* TODO
		#ifdef HAVE_CUBLAS