Commit 3aa06ead authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing CUDA benchmark.

parent 20aa4678
Loading
Loading
Loading
Loading
+1 −60
Original line number Diff line number Diff line
@@ -15,63 +15,4 @@
 *                                                                         *
 ***************************************************************************/

#include <core/vectors/tnlVector.h>
#include <core/tnlTimerRT.h>

// Entry point of the CUDA benchmark.
//
// When HAVE_CUDA is defined it
//   1. times one host -> device assignment of a vector of 2^22 doubles and
//      prints the resulting bandwidth in GB/sec,
//   2. times one addVector() call on two host vectors of the same size and
//      prints the elapsed time.
// Without HAVE_CUDA nothing is measured.
//
// argc and argv are not used. Returns EXIT_SUCCESS in both configurations.
int main( int argc, char* argv[] )
{
#ifdef HAVE_CUDA

    // Single place that fixes the element type, so that the vector typedefs
    // and the byte count used for the bandwidth can never disagree.
    typedef double Real;
    typedef tnlVector< Real, tnlHost > HostVector;
    typedef tnlVector< Real, tnlCuda > CudaVector;

    tnlTimerRT timer;
    const double oneGB = 1024.0 * 1024.0 * 1024.0;

    cout << "Benchmarking memory bandwidth: ";

    const int size = 1 << 22;

    HostVector hostVector;
    CudaVector deviceVector;
    hostVector.setSize( size );
    deviceVector.setSize( size );

    hostVector.setValue( 1.0 );
    deviceVector.setValue( 0.0 );

    // Time a single host -> device copy.
    timer.reset();
    timer.start();
    deviceVector = hostVector;
    timer.stop();

    // The number of bytes moved is size * sizeof( element type ). The previous
    // code used sizeof( int ), which under-reported the bandwidth for double
    // elements.
    // NOTE(review): assumes timer.getTime() returns seconds, as the "GB/sec."
    // label implies - confirm against tnlTimerRT.
    const double transferredBytes = ( double ) ( size ) * sizeof( Real );
    double bandwidth = transferredBytes / timer.getTime() / oneGB;
    cout << bandwidth << " GB/sec." << endl;

    HostVector hostVector2;
    CudaVector deviceVector2;
    hostVector2.setLike( hostVector );
    deviceVector2.setLike( deviceVector );
    hostVector2.setValue( 1.0 );
    deviceVector2.setValue( 1.0 );

    // Time one vector addition on the host.
    cout << "Benchmarking vector addition on CPU: ";
    timer.reset();
    timer.start();
    hostVector.addVector( hostVector2 );
    timer.stop();
    double hostTime = timer.getTime();

    // Report the measurement; previously hostTime was computed but never
    // used, which left the line printed above without a result or newline.
    cout << hostTime << " sec." << endl;

#endif
   return EXIT_SUCCESS;
}
#include "tnl-cuda-benchmarks.h"
 No newline at end of file
+45 −232
Original line number Diff line number Diff line
@@ -15,243 +15,56 @@
 *                                                                         *
 ***************************************************************************/

#ifndef TNLBENCHMARKS_H_
#define TNLBENCHMARKS_H_
// Include guard. The macro that is tested and the macro that is defined must
// have the same name, otherwise a second inclusion is not suppressed.
#ifndef TNLCUDABENCHMARKS_H_
#define TNLCUDABENCHMARKS_H_

#include <core/mfuncs.h>
#include <core/tnlTimerCPU.h>
#include <../tests/unit-tests/core/tnl-cuda-kernels.h>
#include <core/low-level/cuda-long-vector-kernels.h>
#include <core/vectors/tnlVector.h>
#include <core/tnlTimerRT.h>

template< class T >
bool transferBenchmark( const int size,
                        double& host_to_host_band_width,
                        double& host_to_device_band_width,
                        double& device_to_host_band_width,
                        double& device_to_device_band_width )
int main( int argc, char* argv[] )
{
#ifdef HAVE_CUDA

  tnlVector< T > host_vector( "transferBenchmark:host-vector", size );
  tnlVector< T > host_vector2( "transferBenchmark:host-vector-2", size );
  tnlVector< T, tnlCuda > device_vector( "transferBenchmark:device-vector", size );
  tnlVector< T, tnlCuda > device_vector2( "transferBenchmark:device-vector-2", size );

   for( int i = 0; i < size; i ++ )
      host_vector[ i ] = i + 1;

   const long int cycles = 100;
   long int bytes = cycles * size * sizeof( int );
   long int mega_byte = 1 << 20;

   tnlTimerCPU timer;
   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      host_vector2 = host_vector;

   double time = timer. getTime();
   double giga_byte = ( double ) ( 1 << 30 );
   host_to_host_band_width = bytes / giga_byte / time;

   cout << "Transfering " << bytes / mega_byte << " MB from HOST to HOST took " << time << " seconds. Bandwidth is " << host_to_host_band_width << " GB/s." << endl;

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      device_vector = host_vector;

   time = timer. getTime();
   host_to_device_band_width = bytes / giga_byte / time;

   cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      host_vector2 = device_vector;

   time = timer. getTime();
   device_to_host_band_width = bytes / giga_byte / time;

   cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      device_vector2 = device_vector;


   time = timer. getTime();

   // Since we read the data and write it back, we process twice as many bytes.
   bytes *= 2;
   device_to_device_band_width = bytes / giga_byte / time;
    tnlTimerRT timer;
    const double oneGB = 1024.0 * 1024.0 * 1024.0;

   cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to DEVICE took " << time << " seconds. Bandwidth is " << device_to_device_band_width << " GB/s." << endl;
}
    cout << "Benchmarking memory bandwidth: ";

// Sequential sum of all elements of a host vector.
//
// host_vector - vector whose elements are read via getData() / getSize()
// sum         - output; reset to 0.0 and then incremented by every element
//               in index order
template< class T >
void tnlCPUReductionSum( const tnlVector< T >& host_vector,
                         T& sum )
{
   const T* values = host_vector. getData();
   const int count = host_vector. getSize();

   sum = 0.0;
   int idx = 0;
   while( idx < count )
   {
      sum += values[ idx ];
      ++idx;
   }
}
    const int size = 1 << 22;
    
// Sequential minimum over the elements of a host vector.
//
// host_vector - vector whose elements are read via getData() / getSize();
//               element 0 is read unconditionally, so the vector must not
//               be empty
// min         - output; starts as element 0 and is folded with every
//               following element through the global Min() helper
template< class T >
void tnlCPUReductionMin( const tnlVector< T >& host_vector,
                         T& min )
{
   const T* values = host_vector. getData();
   const int count = host_vector. getSize();
   //tnlAssert( values );

   min = values[ 0 ];
   int idx = 1;
   while( idx < count )
   {
      min = :: Min( min, values[ idx ] );
      ++idx;
   }
}
    typedef tnlVector< double, tnlHost > HostVector;
    typedef tnlVector< double, tnlCuda > CudaVector;

// Sequential maximum over the elements of a host vector.
//
// host_vector - vector whose elements are read via getData() / getSize();
//               element 0 is read unconditionally, so the vector must not
//               be empty
// max         - output; starts as element 0 and is folded with every
//               following element through the global Max() helper
template< class T >
void tnlCPUReductionMax( const tnlVector< T >& host_vector,
                         T& max )
{
   const T* values = host_vector. getData();
   const int count = host_vector. getSize();
   //tnlAssert( values );

   max = values[ 0 ];
   int idx = 1;
   while( idx < count )
   {
      max = :: Max( max, values[ idx ] );
      ++idx;
   }
}

template< class T >
void reductionBenchmark( const int size,
                         const int algorithm )
{
   tnlVector< T > host_vector( "reductionBenchmark:host-vector", size );
   tnlVector< T, tnlCuda > device_vector( "reductionBenchmark:device-vector", size );
   tnlVector< T, tnlCuda > device_aux( "reductionBenchmark:device-aux", size / 2 );
    HostVector hostVector;
    CudaVector deviceVector;
    hostVector.setSize( size );
    deviceVector.setSize( size );

   for( int i = 0; i < size; i ++ )
      host_vector[ i ] = i + 1;
    hostVector.setValue( 1.0 );
    deviceVector.setValue( 0.0 );
    
   device_vector = host_vector;
    timer.reset();
    timer.start();
    deviceVector = hostVector;
    timer.stop();    
    double bandwidth = ( double ) ( size ) * sizeof( int ) / timer.getTime() / oneGB;
    cout << bandwidth << " GB/sec." << endl;
    
   T sum, min, max;
   const long int reducing_cycles( 1 );
    HostVector hostVector2;
    CudaVector deviceVector2;
    hostVector2.setLike( hostVector );
    deviceVector2.setLike( deviceVector );
    hostVector2.setValue( 1.0 );
    deviceVector2.setValue( 1.0 );
    cout << "Benchmarking vector addition on CPU: ";
    timer.reset();
    timer.start();
    hostVector.addVector( hostVector2 );
    timer.stop();
    double hostTime = timer.getTime();

   tnlTimerCPU timer;
   timer. Reset();
   for( int i = 0; i < reducing_cycles; i ++ )
   {
      switch( algorithm )
      {
         case 0:  // reduction on CPU
            tnlCPUReductionSum( host_vector, sum );
            tnlCPUReductionMin( host_vector, sum );
            tnlCPUReductionMax( host_vector, sum );
#ifdef HAVE_CUDA
         case 1:
            tnlCUDASimpleReduction1< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getData(),
                                                                   sum,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction1< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getData(),
                                                                   min,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction1< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getData(),
                                                                   max,
                                                                   device_aux. getData() );
            break;
         case 2:
            tnlCUDASimpleReduction2< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getData(),
                                                                   sum,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction2< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getData(),
                                                                   min,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction2< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getData(),
                                                                   max,
                                                                   device_aux. getData() );
            break;
         case 3:
            tnlCUDASimpleReduction3< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getData(),
                                                                   sum,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction3< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getData(),
                                                                   min,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction3< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getData(),
                                                                   max,
                                                                   device_aux. getData() );
            break;
         case 4:
            tnlCUDASimpleReduction4< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getData(),
                                                                   sum,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction4< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getData(),
                                                                   min,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction4< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getData(),
                                                                   max,
                                                                   device_aux. getData() );
            break;
         case 5:
            tnlCUDASimpleReduction5< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getData(),
                                                                   sum,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction5< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getData(),
                                                                   min,
                                                                   device_aux. getData() );
            tnlCUDASimpleReduction5< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getData(),
                                                                   max,
                                                                   device_aux. getData() );
            break;
         default:
            reductionOnCudaDevice< T, T, int, tnlParallelReductionSum >( size,
                                                                              device_vector. getData(),
                                                                              NULL,
                                                                              sum,
                                                                              0.0,
                                                                              device_aux. getData() );
            reductionOnCudaDevice< T, T, int, tnlParallelReductionMin >( size,
                                                                              device_vector. getData(),
                                                                              NULL,
                                                                              min,
                                                                              0.0,
                                                                              device_aux. getData() );
            reductionOnCudaDevice< T, T, int, tnlParallelReductionMax >( size,
                                                                              device_vector. getData(),
                                                                              NULL,
                                                                              max,
                                                                              0.0,
                                                                              device_aux. getData() );
#endif

      }
   }
   const double time = timer. getTime();
   double giga_byte = ( double ) ( 1 << 30 );
   long int mega_byte = 1 << 20;
   long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3;
   const double reduction_band_width = bytes_reduced / giga_byte / time;

   cout << "Reducing " << bytes_reduced / mega_byte
        << " MB on DEVICE using algorithm " << algorithm
        << " took " << time
        << " seconds. Bandwidth is " << reduction_band_width
        << " GB/s." << endl;
   return EXIT_SUCCESS;
}

#endif /* TNLBENCHMARKS_H_ */
#endif /* TNLCUDABENCHMARKS_H_ */