Loading tests/benchmarks/CMakeLists.txt +3 −0 Original line number Diff line number Diff line Loading @@ -2,6 +2,9 @@ ADD_SUBDIRECTORY( share ) IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu ) if( WITH_CUBLAS STREQUAL "yes" ) CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} ) endif() TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu ) Loading tests/benchmarks/cublasWrappers.h 0 → 100644 +27 −0 Original line number Diff line number Diff line #pragma once #ifdef HAVE_CUDA #ifdef HAVE_CUBLAS #include <cublas_v2.h> inline cublasStatus_t cublasGdot( cublasHandle_t handle, int n, const float *x, int incx, const float *y, int incy, float *result ) { return cublasSdot( handle, n, x, incx, y, incy, result ); } inline cublasStatus_t cublasGdot( cublasHandle_t handle, int n, const double *x, int incx, const double *y, int incy, double *result ) { return cublasDdot( handle, n, x, incx, y, incy, result ); } #endif #endif tests/benchmarks/vector-operations.h +21 −20 Original line number Diff line number Diff line Loading @@ -5,7 +5,7 @@ #include <core/vectors/tnlVector.h> #ifdef HAVE_CUBLAS //#include <cublas.h> #include "cublasWrappers.h" #endif namespace tnl Loading Loading @@ -36,6 +36,11 @@ benchmarkVectorOperations( const int & loops, Real resultHost, resultDevice; #ifdef HAVE_CUBLAS cublasHandle_t cublasHandle; cublasCreate( &cublasHandle ); #endif // reset functions // (Make sure to always use some in benchmarks, even if it's not necessary Loading Loading @@ -200,29 +205,21 @@ benchmarkVectorOperations( const int & loops, auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar 
product on GPU with Cublas: " << endl; cublasHandle_t handle; cublasCreate( &handle ); timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) cublasDdot( handle, size, deviceVector.getData(), 1, auto scalarProductCublas = [&]() { cublasGdot( cublasHandle, size, deviceVector.getData(), 1, deviceVector2.getData(), 1, &resultDevice ); cudaThreadSynchronize(); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; }; #endif */ benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda #ifdef HAVE_CUBLAS , "cuBLAS", scalarProductCublas #endif ); /* cout << "Benchmarking prefix-sum:" << endl; Loading Loading @@ -253,6 +250,10 @@ benchmarkVectorOperations( const int & loops, } */ #ifdef HAVE_CUBLAS cublasDestroy( cublasHandle ); #endif return true; } Loading Loading
tests/benchmarks/CMakeLists.txt +3 −0 Original line number Diff line number Diff line Loading @@ -2,6 +2,9 @@ ADD_SUBDIRECTORY( share ) IF( BUILD_CUDA ) CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu ) if( WITH_CUBLAS STREQUAL "yes" ) CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} ) endif() TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} ) CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu ) Loading
// tests/benchmarks/cublasWrappers.h  (new file, mode 100644, +27 −0)
//
// Overloaded front ends for the cuBLAS dot-product routines. Both overloads
// share the name cublasGdot, so a caller templated on the element type can
// write a single call and let overload resolution pick the precision-specific
// cuBLAS entry point (cublasSdot for float, cublasDdot for double).
//
// The whole header is empty unless both HAVE_CUDA and HAVE_CUBLAS are defined.
#pragma once

#if defined( HAVE_CUDA ) && defined( HAVE_CUBLAS )

#include <cublas_v2.h>

// Single-precision overload: passes every argument through to cublasSdot
// unchanged and returns the status code that call produced.
inline cublasStatus_t
cublasGdot( cublasHandle_t handle, int n,
            const float *x, int incx,
            const float *y, int incy,
            float *result )
{
   const cublasStatus_t status = cublasSdot( handle, n, x, incx, y, incy, result );
   return status;
}

// Double-precision overload: passes every argument through to cublasDdot
// unchanged and returns the status code that call produced.
inline cublasStatus_t
cublasGdot( cublasHandle_t handle, int n,
            const double *x, int incx,
            const double *y, int incy,
            double *result )
{
   const cublasStatus_t status = cublasDdot( handle, n, x, incx, y, incy, result );
   return status;
}

#endif // HAVE_CUDA && HAVE_CUBLAS
tests/benchmarks/vector-operations.h +21 −20 Original line number Diff line number Diff line Loading @@ -5,7 +5,7 @@ #include <core/vectors/tnlVector.h> #ifdef HAVE_CUBLAS //#include <cublas.h> #include "cublasWrappers.h" #endif namespace tnl Loading Loading @@ -36,6 +36,11 @@ benchmarkVectorOperations( const int & loops, Real resultHost, resultDevice; #ifdef HAVE_CUBLAS cublasHandle_t cublasHandle; cublasCreate( &cublasHandle ); #endif // reset functions // (Make sure to always use some in benchmarks, even if it's not necessary Loading Loading @@ -200,29 +205,21 @@ benchmarkVectorOperations( const int & loops, auto scalarProductCuda = [&]() { resultDevice = deviceVector.scalarProduct( deviceVector2 ); }; benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda ); /* TODO #ifdef HAVE_CUBLAS cout << "Benchmarking scalar product on GPU with Cublas: " << endl; cublasHandle_t handle; cublasCreate( &handle ); timer.reset(); timer.start(); for( int i = 0; i < loops; i++ ) cublasDdot( handle, size, deviceVector.getData(), 1, auto scalarProductCublas = [&]() { cublasGdot( cublasHandle, size, deviceVector.getData(), 1, deviceVector2.getData(), 1, &resultDevice ); cudaThreadSynchronize(); timer.stop(); bandwidth = 2 * datasetSize / timer.getTime(); cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl; }; #endif */ benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1, "CPU", scalarProductHost, "GPU", scalarProductCuda #ifdef HAVE_CUBLAS , "cuBLAS", scalarProductCublas #endif ); /* cout << "Benchmarking prefix-sum:" << endl; Loading Loading @@ -253,6 +250,10 @@ benchmarkVectorOperations( const int & loops, } */ #ifdef HAVE_CUBLAS cublasDestroy( cublasHandle ); #endif return true; } Loading