Commit 742581b2 authored by Jakub Klinkovský
Browse files

Splitting benchmarks of vector operations into separate header

parent da855c19
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -9,6 +9,8 @@ namespace tnl
namespace benchmarks
{

// Number of bytes in one gigabyte, taken as 1024^3, stored as a double.
const double oneGB = 1024.0 * 1024.0 * 1024.0;

// TODO: add data member for error message
// Empty type carrying no data yet; presumably used to signal a failed
// benchmark -- its use is not visible in this part of the file.
struct BenchmarkError {};

+2 −139
Original line number Diff line number Diff line
@@ -19,13 +19,12 @@
#define TNLCUDBENCHMARKS_H_

#include <tnlConfig.h>
#include <core/vectors/tnlVector.h>
#include <core/tnlList.h>
#include <matrices/tnlSlicedEllpackMatrix.h>
#include <matrices/tnlEllpackMatrix.h>
#include <matrices/tnlCSRMatrix.h>

#include "benchmarks.h"
#include "vector-operations.h"

using namespace tnl::benchmarks;

@@ -37,7 +36,6 @@ using namespace tnl::benchmarks;
// Three-parameter alias for tnlSlicedEllpackMatrix; it is the form passed as
// the matrix template argument to benchmarkSpMV in main().
template< typename RealType,
          typename DeviceType,
          typename IndexType >
using SlicedEllpackMatrix = tnlSlicedEllpackMatrix< RealType, DeviceType, IndexType >;

const double oneGB = 1024.0 * 1024.0 * 1024.0;


// TODO:
@@ -199,9 +197,6 @@ int main( int argc, char* argv[] )
#ifdef HAVE_CUDA
   
   typedef double Real;
   typedef tnlVector< Real, tnlHost > HostVector;
   typedef tnlVector< Real, tnlCuda > CudaVector;

   
   /****
    * The first argument of this program is the size of data set to be reduced.
@@ -217,139 +212,7 @@ int main( int argc, char* argv[] )
   if( argc > 3 )
      elementsPerRow = atoi( argv[ 3 ] );
   
   
   double datasetSize = ( double ) ( loops * size ) * sizeof( Real ) / oneGB;
   
   HostVector hostVector, hostVector2;
   CudaVector deviceVector, deviceVector2;
   hostVector.setSize( size );
   if( ! deviceVector.setSize( size ) )
      return EXIT_FAILURE;
   hostVector2.setLike( hostVector );
   if( ! deviceVector2.setLike( deviceVector ) )
      return EXIT_FAILURE;

   Real resultHost, resultDevice;


   // check functions
   auto compare1 = [&]() {
      return hostVector == deviceVector;
   };
   auto compare2 = [&]() {
      return hostVector2 == deviceVector2;
   };
   auto compare12 = [&]() {
      return compare1() && compare2();
   };
   auto compareScalars = [&]() {
      return resultHost == resultDevice;
   };

   // reset functions
   auto reset1 = [&]() {
      hostVector.setValue( 1.0 );
      deviceVector.setValue( 1.0 );
   };
   auto reset2 = [&]() {
      hostVector2.setValue( 1.0 );
      deviceVector2.setValue( 1.0 );
   };
   auto reset12 = [&]() {
      reset1();
      reset2();
   };


   reset12();

   cout << "Benchmarking CPU-GPU memory transfer:" << endl;
   auto copyAssign = [&]() {
      deviceVector = hostVector;
   };
   cout << "  ";
   benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );
    

   cout << "Benchmarking vector addition:" << endl;
   auto addVectorHost = [&]() {
      hostVector.addVector( hostVector2 );
   };
   auto addVectorCuda = [&]() {
      deviceVector.addVector( deviceVector2 );
      // TODO: synchronization should be part of addVector
      cudaThreadSynchronize();
   };
   benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );


   cout << "Benchmarking scalar product:" << endl;
   auto scalarProductHost = [&]() {
      resultHost = hostVector.scalarProduct( hostVector2 );
   };
   auto scalarProductCuda = [&]() {
      resultDevice = deviceVector.scalarProduct( deviceVector2 );
   };
   benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );

/* TODO
#ifdef HAVE_CUBLAS
   cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
   cublasHandle_t handle;
   cublasCreate( &handle );
   timer.reset();
   timer.start();   
   for( int i = 0; i < loops; i++ )
      cublasDdot( handle,
                  size,
                  deviceVector.getData(), 1,
                  deviceVector.getData(), 1,
                  &resultDevice );
   cudaThreadSynchronize();
   timer.stop();
   bandwidth = 2 * datasetSize / timer.getTime();
   cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
#endif    
*/

   cout << "Benchmarking L2 norm: " << endl;
   auto l2normHost = [&]() {
      resultHost = hostVector.lpNorm( 2.0 );
   };
   auto l2normCuda = [&]() {
      resultDevice = deviceVector.lpNorm( 2.0 );
   };
   benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );


   /*
   cout << "Benchmarking prefix-sum:" << endl;
   timer.reset();
   timer.start();
   hostVector.computePrefixSum();
   timer.stop();
   timeHost = timer.getTime();
   bandwidth = 2 * datasetSize / loops / timer.getTime();
   cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
   
   timer.reset();
   timer.start();
   deviceVector.computePrefixSum();
   timer.stop();
   timeDevice = timer.getTime();
   bandwidth = 2 * datasetSize / loops / timer.getTime();
   cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
   cout << "  CPU/GPU speedup: " << timeHost / timeDevice << endl;

   HostVector auxHostVector;
   auxHostVector.setLike( deviceVector );
   auxHostVector = deviceVector;
   for( int i = 0; i < size; i++ )
      if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
      {
         cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl;
      }
*/
   benchmarkVectorOperations< Real >( loops, size );

   benchmarkSpMV< Real, tnlEllpackMatrix >( loops, size, elementsPerRow );
   benchmarkSpMV< Real, SlicedEllpackMatrix >( loops, size, elementsPerRow );
+159 −0
Original line number Diff line number Diff line
#pragma once

#include "benchmarks.h"

#include <core/vectors/tnlVector.h>

namespace tnl
{
namespace benchmarks
{

/**
 * Benchmarks basic vector operations on the host and on the CUDA device.
 *
 * Allocates two host vectors and two CUDA vectors of `size` elements, sets
 * all of them to 1.0 and then runs, in this order: host-to-device copy
 * assignment, addVector, scalarProduct and lpNorm( 2.0 ). The timing loops
 * themselves are delegated to benchmarkSingle / benchmarkCuda, which are
 * declared outside this function.
 *
 * \tparam Real   element type of the benchmarked vectors
 * \tparam Index  index type of the benchmarked vectors
 * \param loops   number of repetitions, passed on to the benchmark drivers
 * \param size    number of elements in each vector
 * \return false if any of the four vectors cannot be allocated,
 *         true otherwise
 */
template< typename Real = double,
          typename Index = int >
bool
benchmarkVectorOperations( const int & loops,
                           const int & size )
{
    typedef tnlVector< Real, tnlHost, Index > HostVector;
    typedef tnlVector< Real, tnlCuda, Index > CudaVector;
    using namespace std;

    // Size of `loops` passes over one vector, in units of oneGB.
    // The conversion to double happens BEFORE the multiplication so that
    // loops * size cannot overflow the int range for large inputs.
    const double datasetSize = ( double ) loops * size * sizeof( Real ) / oneGB;

    HostVector hostVector, hostVector2;
    CudaVector deviceVector, deviceVector2;
    // Allocation failures are reported for host and device vectors alike.
    if( ! hostVector.setSize( size ) )
        return false;
    if( ! deviceVector.setSize( size ) )
        return false;
    if( ! hostVector2.setLike( hostVector ) )
        return false;
    if( ! deviceVector2.setLike( deviceVector ) )
        return false;

    // Written by the scalar-valued benchmarks below, read by compareScalars.
    Real resultHost, resultDevice;


    // check functions
    auto compare1 = [&]() {
        return hostVector == deviceVector;
    };
    // NOTE(review): exact floating-point comparison of host and device
    // results; consider a tolerance if the inputs stop being all ones.
    auto compareScalars = [&]() {
        return resultHost == resultDevice;
    };

    // reset functions
    auto reset1 = [&]() {
        hostVector.setValue( 1.0 );
        deviceVector.setValue( 1.0 );
    };
    auto reset2 = [&]() {
        hostVector2.setValue( 1.0 );
        deviceVector2.setValue( 1.0 );
    };
    auto reset12 = [&]() {
        reset1();
        reset2();
    };


    // Bring all four vectors into a defined state before the first benchmark.
    reset12();

    cout << "Benchmarking CPU-GPU memory transfer:" << endl;
    auto copyAssign = [&]() {
        deviceVector = hostVector;
    };
    cout << "  ";
    benchmarkSingle( loops, datasetSize, copyAssign, compare1, reset1 );


    cout << "Benchmarking vector addition:" << endl;
    auto addVectorHost = [&]() {
        hostVector.addVector( hostVector2 );
    };
    auto addVectorCuda = [&]() {
        deviceVector.addVector( deviceVector2 );
        // TODO: synchronization should be part of addVector
        // NOTE(review): cudaThreadSynchronize is deprecated in the CUDA
        // runtime API; cudaDeviceSynchronize is its documented replacement.
        cudaThreadSynchronize();
    };
    // Factor 3: accounted as two vector reads plus one vector write.
    benchmarkCuda( loops, 3 * datasetSize, addVectorHost, addVectorCuda, compare1, reset1 );


    cout << "Benchmarking scalar product:" << endl;
    auto scalarProductHost = [&]() {
        resultHost = hostVector.scalarProduct( hostVector2 );
    };
    auto scalarProductCuda = [&]() {
        resultDevice = deviceVector.scalarProduct( deviceVector2 );
    };
    // Factor 2: two vectors are read.
    benchmarkCuda( loops, 2 * datasetSize, scalarProductHost, scalarProductCuda, compareScalars, voidFunc );

/* TODO
#ifdef HAVE_CUBLAS
   cout << "Benchmarking scalar product on GPU with Cublas: " << endl;
   cublasHandle_t handle;
   cublasCreate( &handle );
   timer.reset();
   timer.start();   
   for( int i = 0; i < loops; i++ )
      cublasDdot( handle,
                  size,
                  deviceVector.getData(), 1,
                  deviceVector.getData(), 1,
                  &resultDevice );
   cudaThreadSynchronize();
   timer.stop();
   bandwidth = 2 * datasetSize / timer.getTime();
   cout << "bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
#endif    
*/

    cout << "Benchmarking L2 norm: " << endl;
    auto l2normHost = [&]() {
        resultHost = hostVector.lpNorm( 2.0 );
    };
    auto l2normCuda = [&]() {
        resultDevice = deviceVector.lpNorm( 2.0 );
    };
    benchmarkCuda( loops, datasetSize, l2normHost, l2normCuda, compareScalars, voidFunc );


    /*
    cout << "Benchmarking prefix-sum:" << endl;
    timer.reset();
    timer.start();
    hostVector.computePrefixSum();
    timer.stop();
    timeHost = timer.getTime();
    bandwidth = 2 * datasetSize / loops / timer.getTime();
    cout << "  CPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
    
    timer.reset();
    timer.start();
    deviceVector.computePrefixSum();
    timer.stop();
    timeDevice = timer.getTime();
    bandwidth = 2 * datasetSize / loops / timer.getTime();
    cout << "  GPU: bandwidth: " << bandwidth << " GB/sec, time: " << timer.getTime() << " sec." << endl;
    cout << "  CPU/GPU speedup: " << timeHost / timeDevice << endl;
 
    HostVector auxHostVector;
    auxHostVector.setLike( deviceVector );
    auxHostVector = deviceVector;
    for( int i = 0; i < size; i++ )
       if( hostVector.getElement( i ) != auxHostVector.getElement( i ) )
       {
          cerr << "Error in prefix sum at position " << i << ":  " << hostVector.getElement( i ) << " != " << auxHostVector.getElement( i ) << endl;
       }
    */

    return true;
}

} // namespace benchmarks
} // namespace tnl