Added cuBLAS benchmarks for scalar product (7b7a4f94) · Commits · TNL / tnl-dev

tests/benchmarks/CMakeLists.txt

+3 −0

Original line number	Diff line number	Diff line
		@@ -2,6 +2,9 @@ ADD_SUBDIRECTORY( share )

		IF( BUILD_CUDA )
		CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu )
		if( WITH_CUBLAS STREQUAL "yes" )
		CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} )
		endif()
		TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )

		CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu )

tests/benchmarks/cublasWrappers.h

0 → 100644

+27 −0

Original line number	Diff line number	Diff line
		#pragma once

		#ifdef HAVE_CUDA
		#ifdef HAVE_CUBLAS

		#include <cublas_v2.h>

		/**
		 * Single-precision overload of the type-generic dot-product wrapper.
		 *
		 * Passes every argument through to cublasSdot unchanged, so that
		 * templated callers can write cublasGdot( ... ) and let overload
		 * resolution on the pointer types select the cuBLAS routine.
		 *
		 * Returns the cublasStatus_t produced by cublasSdot, unmodified.
		 */
		inline cublasStatus_t
		cublasGdot( cublasHandle_t handle, int n,
		            const float *x, int incx,
		            const float *y, int incy,
		            float *result )
		{
		   // Keep the status in a named local so it is easy to inspect in a debugger.
		   const auto status = cublasSdot( handle, n, x, incx, y, incy, result );
		   return status;
		}

		/**
		 * Double-precision overload of the type-generic dot-product wrapper.
		 *
		 * Passes every argument through to cublasDdot unchanged, so that
		 * templated callers can write cublasGdot( ... ) and let overload
		 * resolution on the pointer types select the cuBLAS routine.
		 *
		 * Returns the cublasStatus_t produced by cublasDdot, unmodified.
		 */
		inline cublasStatus_t
		cublasGdot( cublasHandle_t handle, int n,
		            const double *x, int incx,
		            const double *y, int incy,
		            double *result )
		{
		   // Keep the status in a named local so it is easy to inspect in a debugger.
		   const auto status = cublasDdot( handle, n, x, incx, y, incy, result );
		   return status;
		}

		#endif
		#endif

tests/benchmarks/vector-operations.h

+21 −20

Original line number	Diff line number	Diff line
		@@ -5,7 +5,7 @@
		#include <core/vectors/tnlVector.h>

		#ifdef HAVE_CUBLAS
		//#include <cublas.h>
		#include "cublasWrappers.h"
		#endif

		namespace tnl
		@@ -36,6 +36,11 @@ benchmarkVectorOperations( const int & loops,

		Real resultHost, resultDevice;

		#ifdef HAVE_CUBLAS
		cublasHandle_t cublasHandle;
		cublasCreate( &cublasHandle );
		#endif


		// reset functions
		// (Make sure to always use some in benchmarks, even if it's not necessary
		@@ -200,29 +205,21 @@ benchmarkVectorOperations( const int & loops,
		auto scalarProductCuda = [&]() {
		resultDevice = deviceVector.scalarProduct( deviceVector2 );
		};
		benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
		"CPU", scalarProductHost,
		"GPU", scalarProductCuda );

		/* TODO
		#ifdef HAVE_CUBLAS
		cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
		cublasHandle_t handle;
		cublasCreate( &handle );
		timer.reset();
		timer.start();
		for( int i = 0; i < loops; i++ )
		cublasDdot( handle,
		size,
		deviceVector.getData(), 1,
		auto scalarProductCublas = [&]() {
		cublasGdot( cublasHandle, size,
		deviceVector.getData(), 1,
		deviceVector2.getData(), 1,
		&resultDevice );
		cudaThreadSynchronize();
		timer.stop();
		bandwidth = 2 * datasetSize / timer.getTime();
		cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
		};
		#endif
		*/
		benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
		"CPU", scalarProductHost,
		"GPU", scalarProductCuda
		#ifdef HAVE_CUBLAS
		, "cuBLAS", scalarProductCublas
		#endif
		);

		/*
		cout << "Benchmarking prefix-sum:" << endl;
		@@ -253,6 +250,10 @@ benchmarkVectorOperations( const int & loops,
		}
		*/

		#ifdef HAVE_CUBLAS
		cublasDestroy( cublasHandle );
		#endif

		return true;
		}