Commit 7b7a4f94 authored by Jakub Klinkovský
Browse files

Added cuBLAS benchmarks for scalar product

parent 88ac33c7
Loading
Loading
Loading
Loading
+3 −0
Original line number Diff line number Diff line
@@ -2,6 +2,9 @@ ADD_SUBDIRECTORY( share )

IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu )
    if( WITH_CUBLAS STREQUAL "yes" )
        CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} )
    endif()
    TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
    
    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu )
+27 −0
Original line number Diff line number Diff line
#pragma once

#ifdef HAVE_CUDA
#ifdef HAVE_CUBLAS

#include <cublas_v2.h>

/**
 * Single-precision overload of the type-generic cublasGdot wrapper.
 *
 * Every argument is passed through unchanged to cublasSdot, and the status
 * code produced by that call is handed back to the caller. Having the same
 * function name for each real type lets code templated on the element type
 * call cublasGdot without selecting the S/D-prefixed routine itself.
 */
inline cublasStatus_t
cublasGdot( cublasHandle_t handle, int n,
            const float *x, int incx,
            const float *y, int incy,
            float *result )
{
    const cublasStatus_t status =
        cublasSdot( handle, n, x, incx, y, incy, result );
    return status;
}

/**
 * Double-precision overload of the type-generic cublasGdot wrapper.
 *
 * Every argument is passed through unchanged to cublasDdot, and the status
 * code produced by that call is handed back to the caller. Having the same
 * function name for each real type lets code templated on the element type
 * call cublasGdot without selecting the S/D-prefixed routine itself.
 */
inline cublasStatus_t
cublasGdot( cublasHandle_t handle, int n,
            const double *x, int incx,
            const double *y, int incy,
            double *result )
{
    const cublasStatus_t status =
        cublasDdot( handle, n, x, incx, y, incy, result );
    return status;
}

#endif
#endif
+21 −20
Original line number Diff line number Diff line
@@ -5,7 +5,7 @@
#include <core/vectors/tnlVector.h>

#ifdef HAVE_CUBLAS
//#include <cublas.h>
#include "cublasWrappers.h"
#endif

namespace tnl
@@ -36,6 +36,11 @@ benchmarkVectorOperations( const int & loops,

    Real resultHost, resultDevice;

#ifdef HAVE_CUBLAS
    cublasHandle_t cublasHandle;
    cublasCreate( &cublasHandle );
#endif


    // reset functions
    // (Make sure to always use some in benchmarks, even if it's not necessary
@@ -200,29 +205,21 @@ benchmarkVectorOperations( const int & loops,
    auto scalarProductCuda = [&]() {
        resultDevice = deviceVector.scalarProduct( deviceVector2 );
    };
    benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
                        "CPU", scalarProductHost,
                        "GPU", scalarProductCuda );

/* TODO
#ifdef HAVE_CUBLAS
   cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
   cublasHandle_t handle;
   cublasCreate( &handle );
   timer.reset();
   timer.start();
   for( int i = 0; i < loops; i++ )
      cublasDdot( handle,
                  size,
                  deviceVector.getData(), 1,
    auto scalarProductCublas = [&]() {
        cublasGdot( cublasHandle, size,
                    deviceVector.getData(), 1,
                    deviceVector2.getData(), 1,
                    &resultDevice );
   cudaThreadSynchronize();
   timer.stop();
   bandwidth = 2 * datasetSize / timer.getTime();
   cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
    };
#endif
*/
    benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
                        "CPU", scalarProductHost,
                        "GPU", scalarProductCuda
#ifdef HAVE_CUBLAS
                      , "cuBLAS", scalarProductCublas
#endif
                      );

    /*
    cout << "Benchmarking prefix-sum:" << endl;
@@ -253,6 +250,10 @@ benchmarkVectorOperations( const int & loops,
       }
    */

#ifdef HAVE_CUBLAS
    cublasDestroy( cublasHandle );
#endif

    return true;
}