Commit 75b18f59 authored by Jakub Klinkovský
Browse files

Refactoring BLAS benchmarks

parent 9e6fe9a8
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -44,8 +44,8 @@ endif()
# set Debug/Release options
set( CMAKE_CXX_FLAGS "-std=c++11 -Wall -Wno-unused-local-typedefs -Wno-unused-variable" )
set( CMAKE_CXX_FLAGS_DEBUG "-g" )
set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG" )
#set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG" )
#set( CMAKE_CXX_FLAGS_RELEASE "-O3 -march=native -mtune=native -DNDEBUG -ftree-vectorizer-verbose=1 -ftree-vectorize -fopt-info-vec-missed -funroll-loops" )
# pass -rdynamic only in Debug mode
set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS "" )
set( CMAKE_SHARED_LIBRARY_LINK_C_FLAGS_DEBUG "-rdynamic" )
+6 −0
Original line number Diff line number Diff line
@@ -55,6 +55,12 @@ String :: String( int number )
   this->setString( convertToString( number ).getString() );
}

/**
 * Constructs the string from an unsigned long integer.
 *
 * The members are first initialised to a null buffer and zero length; the
 * number is then converted with convertToString() and the resulting
 * characters are handed to setString().
 */
String :: String( unsigned long int number )
: string( 0 ), length( 0 )
{
   // Keep the conversion result alive in a named reference while its
   // character data is passed on to setString().
   auto&& numberAsText = convertToString( number );
   this->setString( numberAsText.getString() );
}

String :: String( long int number )
: string( 0 ), length( 0 )
{
+2 −0
Original line number Diff line number Diff line
@@ -55,6 +55,8 @@ class String

   // Builds the string from an int; the definition converts the number with
   // convertToString() and passes the result to setString().
   String( int number );
 
   // Builds the string from an unsigned long int, using the same
   // convertToString()/setString() sequence as the int overload.
   String( unsigned long int number );

   // Overload for long int values.
   // NOTE(review): presumably converts the number to text like the overloads
   // above - confirm against the definition.
   String( long int number );

   // Overload for float values.
   // NOTE(review): presumably converts the number to text like the overloads
   // above - confirm against the definition.
   String( float number );
+14 −19
Original line number Diff line number Diff line
@@ -2,11 +2,11 @@ ADD_SUBDIRECTORY( share )
ADD_SUBDIRECTORY( heat-equation-benchmark )

IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( tnl-cuda-benchmarks${debugExt} tnl-cuda-benchmarks.cu )
    CUDA_ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cu )
    if( HAVE_CUBLAS STREQUAL "yes" )
        CUDA_ADD_CUBLAS_TO_TARGET( tnl-cuda-benchmarks${debugExt} )
        CUDA_ADD_CUBLAS_TO_TARGET( tnl-benchmark-blas${debugExt} )
    endif()
    TARGET_LINK_LIBRARIES( tnl-cuda-benchmarks${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )                        
    TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )

    CUDA_ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cu )
    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
@@ -14,6 +14,9 @@ IF( BUILD_CUDA )
    CUDA_ADD_EXECUTABLE( tnl-benchmark-linear-solvers${debugExt} tnl-benchmark-linear-solvers.cu )
    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} ${CUSPARSE_LIBRARY} )
ELSE()
    ADD_EXECUTABLE( tnl-benchmark-blas${debugExt} tnl-benchmark-blas.cpp )
    TARGET_LINK_LIBRARIES( tnl-benchmark-blas${debugExt} tnl${debugExt}-${tnlVersion} )

    ADD_EXECUTABLE( tnl-benchmark-spmv${debugExt} tnl-benchmark-spmv.cpp )
    TARGET_LINK_LIBRARIES( tnl-benchmark-spmv${debugExt} tnl${debugExt}-${tnlVersion} )

@@ -21,16 +24,8 @@ ELSE()
    TARGET_LINK_LIBRARIES( tnl-benchmark-linear-solvers${debugExt} tnl${debugExt}-${tnlVersion} )
ENDIF()

if( BUILD_CUDA )                                                              
   INSTALL( TARGETS
                tnl-cuda-benchmarks${debugExt}
            RUNTIME DESTINATION bin )
endif()

INSTALL( TARGETS
            tnl-benchmark-blas${debugExt}
            tnl-benchmark-spmv${debugExt}
            tnl-benchmark-linear-solvers${debugExt}
         RUNTIME DESTINATION bin )


                                            
+38 −18
Original line number Diff line number Diff line
@@ -14,7 +14,7 @@ template< typename Real = double,
bool
benchmarkArrayOperations( Benchmark & benchmark,
                          const int & loops,
                          const int & size )
                          const long & size )
{
    typedef Containers::Array< Real, Devices::Host, Index > HostArray;
    typedef Containers::Array< Real, Devices::Cuda, Index > CudaArray;
@@ -25,9 +25,14 @@ benchmarkArrayOperations( Benchmark & benchmark,
    HostArray hostArray, hostArray2;
    CudaArray deviceArray, deviceArray2;
    if( ! hostArray.setSize( size ) ||
        ! hostArray2.setSize( size ) ||
        ! hostArray2.setSize( size )
#ifdef HAVE_CUDA
        ||
        ! deviceArray.setSize( size ) ||
        ! deviceArray2.setSize( size ) )
        ! deviceArray2.setSize( size )
#endif
    )

    {
        const char* msg = "error: allocation of arrays failed";
        std::cerr << msg << std::endl;
@@ -41,11 +46,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
    // reset functions
    auto reset1 = [&]() {
        hostArray.setValue( 1.0 );
#ifdef HAVE_CUDA
        deviceArray.setValue( 1.0 );
#endif
    };
    auto reset2 = [&]() {
        hostArray2.setValue( 1.0 );
#ifdef HAVE_CUDA
        deviceArray2.setValue( 1.0 );
#endif
    };
    auto reset12 = [&]() {
        reset1();
@@ -63,9 +72,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
        resultDevice = (int) deviceArray == deviceArray2;
    };
    benchmark.setOperation( "comparison (operator==)", 2 * datasetSize );
    benchmark.time( reset1,
                    "CPU", compareHost,
                    "GPU", compareCuda );
    benchmark.time( reset1, "CPU", compareHost );
#ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", compareCuda );
#endif


    auto copyAssignHostHost = [&]() {
@@ -75,9 +85,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
        deviceArray = deviceArray2;
    };
    benchmark.setOperation( "copy (operator=)", 2 * datasetSize );
    double basetime = benchmark.time( reset1,
                    "CPU", copyAssignHostHost,
                    "GPU", copyAssignCudaCuda );
    benchmark.time( reset1, "CPU", copyAssignHostHost );
#ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", copyAssignCudaCuda );
#endif


    auto copyAssignHostCuda = [&]() {
@@ -86,10 +97,12 @@ benchmarkArrayOperations( Benchmark & benchmark,
    auto copyAssignCudaHost = [&]() {
        hostArray = deviceArray;
    };
#ifdef HAVE_CUDA
    benchmark.setOperation( "copy (operator=)", datasetSize, basetime );
    benchmark.time( reset1,
                    "CPU->GPU", copyAssignHostCuda,
                    "GPU->CPU", copyAssignCudaHost );
#endif


    auto setValueHost = [&]() {
@@ -99,9 +112,10 @@ benchmarkArrayOperations( Benchmark & benchmark,
        deviceArray.setValue( 3.0 );
    };
    benchmark.setOperation( "setValue", datasetSize );
    benchmark.time( reset1,
                    "CPU", setValueHost,
                    "GPU", setValueCuda );
    benchmark.time( reset1, "CPU", setValueHost );
#ifdef HAVE_CUDA
    benchmark.time( reset1, "GPU", setValueCuda );
#endif


    auto setSizeHost = [&]() {
@@ -112,12 +126,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
    };
    auto resetSize1 = [&]() {
        hostArray.reset();
#ifdef HAVE_CUDA
        deviceArray.reset();
#endif
    };
    benchmark.setOperation( "allocation (setSize)", datasetSize );
    benchmark.time( resetSize1,
                    "CPU", setSizeHost,
                    "GPU", setSizeCuda );
    benchmark.time( resetSize1, "CPU", setSizeHost );
#ifdef HAVE_CUDA
    benchmark.time( resetSize1, "GPU", setSizeCuda );
#endif


    auto resetSizeHost = [&]() {
@@ -128,12 +145,15 @@ benchmarkArrayOperations( Benchmark & benchmark,
    };
    auto setSize1 = [&]() {
        hostArray.setSize( size );
#ifdef HAVE_CUDA
        deviceArray.setSize( size );
#endif
    };
    benchmark.setOperation( "deallocation (reset)", datasetSize );
    benchmark.time( setSize1,
                    "CPU", resetSizeHost,
                    "GPU", resetSizeCuda );
    benchmark.time( setSize1, "CPU", resetSizeHost );
#ifdef HAVE_CUDA
    benchmark.time( setSize1, "GPU", resetSizeCuda );
#endif

    return true;
}
Loading