Commit 7d4d00e7 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Benchmarks: refactoring, generalization for other operations

parent 2a5e19c3
Loading
Loading
Loading
Loading
+70 −66
Original line number Diff line number Diff line
#pragma once

#include <iostream>
#include <iomanip>

#include <core/tnlTimerRT.h>

@@ -11,95 +12,98 @@ namespace benchmarks

const double oneGB = 1024.0 * 1024.0 * 1024.0;

// Exception type thrown by the benchmark routines in this file when the
// check() callback reports a failure. It is an empty tag type and carries
// no information about what went wrong.
// TODO: add data member for error message
struct BenchmarkError {};

// Trivial callbacks for benchmarks that need no result check (trueFunc) or no
// reset step (voidFunc).
// They are declared const on purpose: a const namespace-scope variable has
// internal linkage, so including this header from several translation units
// does not produce multiple definitions of the same symbol.
const auto trueFunc = []() { return true; };   // check that always succeeds
const auto voidFunc = [](){};                  // no-op

template< typename ComputeFunction,
          typename CheckFunction,
          typename ResetFunction >
double
benchmarkSingle( const int & loops,
timeFunction( ComputeFunction compute,
              ResetFunction reset,
              const int & loops,
              const double & datasetSize, // in GB
                 ComputeFunction compute,
                 CheckFunction check,
                 ResetFunction reset )
              const double & baseTime, // in seconds (baseline for speedup calculation)
              const char* performer )
{
    // the timer is constructed zero-initialized and stopped
    tnlTimerRT timer;
    timer.reset();

    reset();
    for(int i = 0; i < loops; ++i) {
        // TODO: not necessary for host computations
        // Explicit synchronization of the CUDA device
#ifdef HAVE_CUDA
        cudaDeviceSynchronize();
#endif
        timer.start();
        compute();
#ifdef HAVE_CUDA
        cudaDeviceSynchronize();
#endif
        timer.stop();

        if( ! check() )
            throw BenchmarkError();

        reset();
    }

    const double time = timer.getTime();
    const double bandwidth = datasetSize / time;
    std::cout << "bandwidth: " << bandwidth << " GB/sec, time: " << time << " sec." << std::endl;

    using namespace std;
    cout << "  " << performer << ": bandwidth: "
         << setw( 8 ) << bandwidth << " GB/sec, time: "
         << setw( 8 ) << time << " sec, speedup: ";
    if( baseTime )
        cout << baseTime / time << endl;
    else
        cout << "N/A" << endl;

    return time;
}

template< typename ComputeHostFunction,
          typename ComputeCudaFunction,
          typename CheckFunction,
          typename ResetFunction >
void
benchmarkCuda( const int & loops,
               const double & datasetSize, // in GB
               ComputeHostFunction computeHost,
               ComputeCudaFunction computeCuda,
               CheckFunction check,
               ResetFunction reset )
// This overload terminates the recursion (exactly one performer/computation pair is left)
template< typename ResetFunction,
          typename ComputeFunction >
inline void
benchmarkNextOperation( const double & datasetSize,
                        const int & loops,
                        ResetFunction reset,
                        const double & baseTime,
                        const char* performer,
                        ComputeFunction compute )
{
    // timers are constructed zero-initialized and stopped
    tnlTimerRT timerHost, timerCuda, timerCudaSync;

    for(int i = 0; i < loops; ++i) {
        timerHost.start();
        computeHost();
        timerHost.stop();

        timerCuda.start();
        computeCuda();
        timerCuda.stop();

        if( ! check() )
            throw BenchmarkError();

        reset();

        // Compute again on CUDA, with explicit synchronization
#ifdef HAVE_CUDA
        cudaDeviceSynchronize();
        timerCudaSync.start();
        computeCuda();
        cudaDeviceSynchronize();
        timerCudaSync.stop();
#endif
    timeFunction( compute, reset, loops, datasetSize, baseTime, performer );
}

        reset();
// Recursive template function to deal with benchmarks involving multiple computations.
//
// Benchmarks the first (performer, compute) pair and then recurses on the
// remaining arguments, which must again be (performer, compute) pairs.
//
// Parameters:
//   datasetSize      - passed on unchanged to every benchmarked computation
//   loops            - passed on unchanged to every benchmarked computation
//   reset            - passed on unchanged to every benchmarked computation
//   baseTime         - passed on unchanged to every benchmarked computation
//   performer        - label of the first computation
//   compute          - the first computation
//   nextComputations - the remaining performer/computation arguments, in order
template< typename ResetFunction,
          typename ComputeFunction,
          typename... NextComputations >
inline void
benchmarkNextOperation( const double & datasetSize,
                        const int & loops,
                        ResetFunction reset,
                        const double & baseTime,
                        const char* performer,
                        ComputeFunction compute,
                        NextComputations & ... nextComputations )
{
    // The remaining arguments are consumed two at a time, so an odd count can
    // never be matched by any overload. Report that up front with a readable
    // message instead of a "no matching function" error from the recursion.
    static_assert( sizeof...( NextComputations ) % 2 == 0,
                   "benchmarkNextOperation: every performer label must be followed by its computation" );

    // Handle the first pair with the non-variadic overload ...
    benchmarkNextOperation( datasetSize, loops, reset, baseTime, performer, compute );
    // ... and recurse on whatever is left.
    benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... );
}

    const double timeHost = timerHost.getTime();
    const double timeCuda = timerCuda.getTime();
    const double timeCudaSync = timerCudaSync.getTime();
    const double bandwidthHost = datasetSize / timeHost;
    const double bandwidthCuda = datasetSize / timeCuda;
    const double bandwidthCudaSync = datasetSize / timeCudaSync;
    std::cout << "  CPU: bandwidth: " << bandwidthHost << " GB/sec, time: " << timeHost << " sec." << std::endl;
    std::cout << "  GPU: bandwidth: " << bandwidthCuda << " GB/sec, time: " << timeCuda << " sec." << std::endl;
    std::cout << "  GPU (sync): bandwidth: " << bandwidthCudaSync << " GB/sec, time: " << timeCudaSync << " sec." << std::endl;
    std::cout << "  CPU/GPU speedup: " << timeHost / timeCuda << std::endl;
    std::cout << "  CPU/GPU (sync) speedup: " << timeHost / timeCudaSync << std::endl;
// Helper for benchmarkOperation: nothing is left to benchmark after the base
// computation, so this overload intentionally does nothing.
template< typename ResetFunction >
inline void
benchmarkNextOperation( const double &,
                        const int &,
                        ResetFunction,
                        const double & )
{
}

// Main function for benchmarking
//
// Prints a header line for the operation, times the base computation and then
// benchmarks every further computation, passing the time of the base
// computation along as the baseline.
//
// Parameters:
//   operation        - name of the operation, printed in the header line
//   datasetSize      - passed on to timeFunction / benchmarkNextOperation
//   loops            - passed on to timeFunction / benchmarkNextOperation
//   reset            - passed on to timeFunction / benchmarkNextOperation
//   performer        - label of the base computation
//   computeBase      - the base computation; its time becomes the baseline
//   nextComputations - optional further (performer, computation) pairs
template< typename ResetFunction,
          typename ComputeFunction,
          typename... NextComputations >
void
benchmarkOperation( const char* operation,
                    const double & datasetSize,
                    const int & loops,
                    ResetFunction reset,
                    const char* performer,
                    ComputeFunction computeBase,
                    NextComputations... nextComputations )
{
    // Names from <iostream> are qualified explicitly: no using-directive is
    // visible in this function.
    std::cout << "Benchmarking " << operation << ":" << std::endl;
    // The base computation is timed with a baseline of 0.0.
    const double baseTime = timeFunction( computeBase, reset, loops, datasetSize, 0.0, performer );
    benchmarkNextOperation( datasetSize, loops, reset, baseTime, nextComputations... );
    std::cout << std::endl;
}

+10 −8
Original line number Diff line number Diff line
@@ -29,6 +29,8 @@
using namespace tnl::benchmarks;


// TODO: should benchmarks check the result of the computation?

// silly alias to match the number of template parameters with other formats
template< typename Real, typename Device, typename Index >
using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< Real, Device, Index >;
@@ -147,19 +149,17 @@ benchmarkSpMV( const int & loops,

   tnlList< tnlString > parsedType;
   parseObjectType( HostMatrix::getType(), parsedType );
   cout << "Benchmarking SpMV (matrix type: " << parsedType[ 0 ] << ", rows: " << size << ", elements per row: " << elementsPerRow << "):" << endl;
   tnlString operationDescription = tnlString("SpMV (matrix type: ") + parsedType[ 0 ]
        + ", rows: " + tnlString(size) + ", elements per row: " + tnlString(elementsPerRow) + ")";

   const int elements = setHostTestMatrix< HostMatrix >( hostMatrix, elementsPerRow );
   setCudaTestMatrix< DeviceMatrix >( deviceMatrix, elementsPerRow );
   const double datasetSize = loops * elements * ( 2 * sizeof( Real ) + sizeof( int ) ) / oneGB;
   hostVector.setValue( 1.0 );
   deviceVector.setValue( 1.0 );

   // check and reset functions
   auto check = [&]() {
      return hostVector2 == deviceVector2;
   };
   // reset function
   auto reset = [&]() {
      hostVector.setValue( 1.0 );
      deviceVector.setValue( 1.0 );
      hostVector2.setValue( 0.0 );
      deviceVector2.setValue( 0.0 );
   };
@@ -172,7 +172,9 @@ benchmarkSpMV( const int & loops,
      deviceMatrix.vectorProduct( deviceVector, deviceVector2 );
   };

   benchmarkCuda( loops, datasetSize, spmvHost, spmvCuda, check, reset );
   benchmarkOperation( operationDescription.getString(), 2 * datasetSize, loops, reset,
                       "CPU", spmvHost,
                       "GPU", spmvCuda );

   return true;
}
+44 −51
Original line number Diff line number Diff line
@@ -37,21 +37,10 @@ benchmarkVectorOperations( const int & loops,
    Real resultHost, resultDevice;


    // check functions
    auto compare1 = [&]() {
        return hostVector == deviceVector;
    };
    auto compare2 = [&]() {
        return hostVector2 == deviceVector2;
    };
    auto compare12 = [&]() {
        return compare1() && compare2();
    };
    auto compareScalars = [&]() {
        return resultHost == resultDevice;
    };

    // reset functions
    // (Always pass one of these to the benchmark, even when a reset is not
    // needed to ensure a correct result - it helps to clear the cache and to
    // avoid optimizations of the benchmark loop.)
    auto reset1 = [&]() {
        hostVector.setValue( 1.0 );
        deviceVector.setValue( 1.0 );
@@ -68,148 +57,152 @@ benchmarkVectorOperations( const int & loops,

    reset12();

    cout << "Benchmarking CPU-CPU memory transfer:" << endl;

    auto copyAssignHostHost = [&]() {
        hostVector = hostVector2;
    };
    cout << "  ";
    benchmarkSingle( loops, datasetSize, copyAssignHostHost, trueFunc, reset1 );

    cout << "Benchmarking CPU-GPU memory transfer:" << endl;
    auto copyAssignHostCuda = [&]() {
        deviceVector = hostVector;
    };
    cout << "  ";
    benchmarkSingle( loops, datasetSize, copyAssignHostCuda, compare1, reset1 );

    cout << "Benchmarking GPU-GPU memory transfer:" << endl;
    auto copyAssignCudaCuda = [&]() {
        deviceVector = hostVector;
    };
    cout << "  ";
    benchmarkSingle( loops, datasetSize, copyAssignCudaCuda, trueFunc, reset1 );

    cout << endl;
    benchmarkOperation( "copy assigment", datasetSize, loops, reset1,
                        "CPU->CPU", copyAssignHostHost,
                        "CPU->GPU", copyAssignHostCuda,
                        "GPU->GPU", copyAssignCudaCuda );


    cout << "Benchmarking tnlVector.operator==" << endl;
    // Stores the outcome of comparing the two host vectors.
    // The parentheses are required: a cast binds tighter than ==, so without
    // them the cast would apply to hostVector alone rather than to the result
    // of the comparison.
    auto compareHost = [&]() {
        resultHost = (int) ( hostVector == hostVector2 );
    };
    // Stores the outcome of comparing the two device vectors.
    // The parentheses are required: a cast binds tighter than ==, so without
    // them the cast would apply to deviceVector alone rather than to the result
    // of the comparison.
    auto compareCuda = [&]() {
        resultDevice = (int) ( deviceVector == deviceVector2 );
    };
    benchmarkCuda( loops, 2 * datasetSize, compareHost, compareCuda, compareScalars, voidFunc );
    benchmarkOperation( "comparison (operator==)", 2 * datasetSize, loops, reset1,
                        "CPU", compareHost,
                        "GPU", compareCuda );


    cout << "Benchmarking scalar multiplication:" << endl;
    auto multiplyHost = [&]() {
        hostVector *= 0.5;
    };
    auto multiplyCuda = [&]() {
        deviceVector *= 0.5;
    };
    benchmarkCuda( loops, 2 * datasetSize, multiplyHost, multiplyCuda, compare1, reset1 );
    benchmarkOperation( "scalar multiplication", 2 * datasetSize, loops, reset1,
                        "CPU", multiplyHost,
                        "GPU", multiplyCuda );


    cout << "Benchmarking vector addition:" << endl;
    auto addVectorHost = [&]() {
        hostVector.addVector( hostVector2 );
    };
    auto addVectorCuda = [&]() {
        deviceVector.addVector( deviceVector2 );
    };
    benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );
    benchmarkOperation( "vector addition", 3 * datasetSize, loops, reset1,
                        "CPU", addVectorHost,
                        "GPU", addVectorCuda );


    cout << "Benchmarking max:" << endl;
    auto maxHost = [&]() {
        resultHost = hostVector.max();
    };
    auto maxCuda = [&]() {
        resultDevice = deviceVector.max();
    };
    benchmarkCuda( loops, datasetSize, maxHost, maxCuda, compareScalars, voidFunc );
    benchmarkOperation( "max", datasetSize, loops, reset1,
                        "CPU", maxHost,
                        "GPU", maxCuda );


    cout << "Benchmarking min:" << endl;
    auto minHost = [&]() {
        resultHost = hostVector.min();
    };
    auto minCuda = [&]() {
        resultDevice = deviceVector.min();
    };
    benchmarkCuda( loops, datasetSize, minHost, minCuda, compareScalars, voidFunc );
    benchmarkOperation( "min", datasetSize, loops, reset1,
                        "CPU", minHost,
                        "GPU", minCuda );


    cout << "Benchmarking absMax:" << endl;
    auto absMaxHost = [&]() {
        resultHost = hostVector.absMax();
    };
    auto absMaxCuda = [&]() {
        resultDevice = deviceVector.absMax();
    };
    benchmarkCuda( loops, datasetSize, absMaxHost, absMaxCuda, compareScalars, voidFunc );
    benchmarkOperation( "absMax", datasetSize, loops, reset1,
                        "CPU", absMaxHost,
                        "GPU", absMaxCuda );


    cout << "Benchmarking absMin:" << endl;
    auto absMinHost = [&]() {
        resultHost = hostVector.absMin();
    };
    auto absMinCuda = [&]() {
        resultDevice = deviceVector.absMin();
    };
    benchmarkCuda( loops, datasetSize, absMinHost, absMinCuda, compareScalars, voidFunc );
    benchmarkOperation( "absMin", datasetSize, loops, reset1,
                        "CPU", absMinHost,
                        "GPU", absMinCuda );


    cout << "Benchmarking sum:" << endl;
    auto sumHost = [&]() {
        resultHost = hostVector.sum();
    };
    auto sumCuda = [&]() {
        resultDevice = deviceVector.sum();
    };
    benchmarkCuda( loops, datasetSize, sumHost, sumCuda, compareScalars, voidFunc );
    benchmarkOperation( "sum", datasetSize, loops, reset1,
                        "CPU", sumHost,
                        "GPU", sumCuda );


    cout << "Benchmarking l1 norm: " << endl;
    auto l1normHost = [&]() {
        resultHost = hostVector.lpNorm( 1.0 );
    };
    auto l1normCuda = [&]() {
        resultDevice = deviceVector.lpNorm( 1.0 );
    };
    benchmarkCuda( loops, datasetSize, l1normHost, l1normCuda, compareScalars, voidFunc );
    benchmarkOperation( "l1 norm", datasetSize, loops, reset1,
                        "CPU", l1normHost,
                        "GPU", l1normCuda );


    cout << "Benchmarking l2 norm: " << endl;
    auto l2normHost = [&]() {
        resultHost = hostVector.lpNorm( 2.0 );
    };
    auto l2normCuda = [&]() {
        resultDevice = deviceVector.lpNorm( 2.0 );
    };
    benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );
    benchmarkOperation( "l2 norm", datasetSize, loops, reset1,
                        "CPU", l2normHost,
                        "GPU", l2normCuda );


    cout << "Benchmarking l3 norm: " << endl;
    auto l3normHost = [&]() {
        resultHost = hostVector.lpNorm( 3.0 );
    };
    auto l3normCuda = [&]() {
        resultDevice = deviceVector.lpNorm( 3.0 );
    };
    benchmarkCuda( loops, datasetSize, l3normHost, l3normCuda, compareScalars, voidFunc );
    benchmarkOperation( "l3 norm", datasetSize, loops, reset1,
                        "CPU", l3normHost,
                        "GPU", l3normCuda );


    cout << "Benchmarking scalar product:" << endl;
    auto scalarProductHost = [&]() {
        resultHost = hostVector.scalarProduct( hostVector2 );
    };
    auto scalarProductCuda = [&]() {
        resultDevice = deviceVector.scalarProduct( deviceVector2 );
    };
    benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );
    benchmarkOperation( "scalar product", 2 * datasetSize, loops, reset1,
                        "CPU", scalarProductHost,
                        "GPU", scalarProductCuda );

/* TODO
#ifdef HAVE_CUBLAS