Commit ad22b77b authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implementing benchmarks for CUDA reduction.

parent f10011fa
Loading
Loading
Loading
Loading
+125 −125
Original line number Diff line number Diff line
@@ -955,7 +955,7 @@ bool tnlCUDASimpleReduction3( const int size,
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 256;    //Desired block size
   const int desBlockSize = 512;    //Desired block size

   bool device_output_allocated( false );
   if( ! device_output )
@@ -1066,7 +1066,7 @@ bool tnlCUDASimpleReduction2( const int size,
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 256;    //Desired block size
   const int desBlockSize = 512;    //Desired block size

   bool device_output_allocated( false );
   if( ! device_output )
@@ -1176,7 +1176,7 @@ bool tnlCUDASimpleReduction1( const int size,
{
   //Calculate necessary block/grid dimensions
   const int cpuThreshold = 1;
   const int desBlockSize = 256;    //Desired block size
   const int desBlockSize = 512;    //Desired block size

   bool device_output_allocated( false );
   if( ! device_output )
+102 −6
Original line number Diff line number Diff line
@@ -18,19 +18,112 @@
#include <core/tnlTimerRT.h>
#include <core/tnlLongVector.h>
#include <core/tnlLongVectorCUDA.h>
#include <core/tnl-cuda-kernels.cu.h>

/****
 * Measures the throughput of one family of the simple CUDA reductions.
 *
 * For the chosen algorithm the Sum, Min and Max variants of
 * tnlCUDASimpleReduction<algorithm> are each called reduction_cycles times
 * on the first size elements of device_vector. The elapsed wall-clock time
 * reported by tnlTimerRT is then converted to a bandwidth in GB/s
 * ( 1 GB = 2^30 bytes ) and printed to the standard output.
 *
 * size          - number of int elements passed to every reduction call
 * device_vector - vector whose Data() pointer is handed to the reductions
 * algorithm     - which reduction variant to run, 1 to 5; any other value
 *                 prints a message and returns without measuring anything
 *
 * NOTE(review): the reduced values written to sum, min and max are not
 * compared with any expected result, and whatever the reduction calls
 * return is ignored - this function only measures time.
 */
void reductionBenchmark( const int size,
                         tnlLongVectorCUDA< int >& device_vector,
                         const int algorithm )
{
   // Without this guard an unknown algorithm would time an empty loop
   // and report a meaningless bandwidth.
   if( algorithm < 1 || algorithm > 5 )
   {
      cout << "Unknown reduction algorithm " << algorithm
           << ", skipping the benchmark." << endl;
      return;
   }

   // The former seq_sum / seq_max / seq_min locals were dropped: they were
   // never read and seq_sum was evaluated in int arithmetic, which overflows
   // for the vector sizes this benchmark is run with.
   int sum( 0 ), min( 0 ), max( 0 );
   const long int reduction_cycles( 10 );

   tnlTimerRT timer;
   timer. Reset();
   for( long int cycle = 0; cycle < reduction_cycles; cycle ++ )
   {
      switch( algorithm )
      {
         case 1:
            tnlCUDASimpleReduction1Sum( size,
                                        device_vector. Data(),
                                        sum );
            tnlCUDASimpleReduction1Min( size,
                                        device_vector. Data(),
                                        min );
            tnlCUDASimpleReduction1Max( size,
                                        device_vector. Data(),
                                        max );
            break;
         case 2:
            tnlCUDASimpleReduction2Sum( size,
                                        device_vector. Data(),
                                        sum );
            tnlCUDASimpleReduction2Min( size,
                                        device_vector. Data(),
                                        min );
            tnlCUDASimpleReduction2Max( size,
                                        device_vector. Data(),
                                        max );
            break;
         case 3:
            tnlCUDASimpleReduction3Sum( size,
                                        device_vector. Data(),
                                        sum );
            tnlCUDASimpleReduction3Min( size,
                                        device_vector. Data(),
                                        min );
            tnlCUDASimpleReduction3Max( size,
                                        device_vector. Data(),
                                        max );
            break;
         case 4:
            tnlCUDASimpleReduction4Sum( size,
                                        device_vector. Data(),
                                        sum );
            tnlCUDASimpleReduction4Min( size,
                                        device_vector. Data(),
                                        min );
            tnlCUDASimpleReduction4Max( size,
                                        device_vector. Data(),
                                        max );
            break;
         case 5:
            tnlCUDASimpleReduction5Sum( size,
                                        device_vector. Data(),
                                        sum );
            tnlCUDASimpleReduction5Min( size,
                                        device_vector. Data(),
                                        min );
            tnlCUDASimpleReduction5Max( size,
                                        device_vector. Data(),
                                        max );
            break;
      }
   }
   const double time = timer. GetTime();

   const double giga_byte = ( double ) ( 1 << 30 );
   const long int mega_byte = 1 << 20;
   // Three reductions ( sum, min, max ) read the whole vector in every cycle.
   // The first operand is widened so the product is never evaluated in int.
   const long int bytes_reduced =
      ( long int ) size * ( long int ) sizeof( int ) * reduction_cycles * 3;
   const double reduction_band_width = bytes_reduced / giga_byte / time;
   cout << "Reducing " << bytes_reduced / mega_byte
        << " MB on DEVICE using algorithm " << algorithm
        << " took " << time
        << " seconds. Bandwidth is " << reduction_band_width
        << " GB/s." << endl;

}

int main( int argc, char* argv[] )
{
   cout << "Benchmarking memory bandwidth when transfering data ..." << endl;

   const long int size = 1 << 24;
   const long int size = 1 << 20;
   tnlLongVector< int > host_vector( size );
   tnlLongVector< int > host_vector2( size );
   tnlLongVectorCUDA< int > device_vector( size );
   tnlLongVectorCUDA< int > device_vector2( size );

   for( int i = 0; i < size; i ++ )
      host_vector[ i ] = i + 1;

   const long int cycles = 100;
   long int bytes = cycles * size * sizeof( int );
   long int mega_byte = 1 << 20;

   tnlTimerRT timer;
   timer. Reset();
@@ -40,7 +133,7 @@ int main( int argc, char* argv[] )
   double giga_byte = ( double ) ( 1 << 30 );
   const double host_to_host_band_width = bytes / giga_byte / time;

   cout << "Transfering " << bytes << " bytes from HOST to HOST took " << time << " seconds. Bandwidth is " << host_to_host_band_width << " GB/s." << endl;
   cout << "Transfering " << bytes / mega_byte << " MB from HOST to HOST took " << time << " seconds. Bandwidth is " << host_to_host_band_width << " GB/s." << endl;

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
@@ -48,15 +141,15 @@ int main( int argc, char* argv[] )
   time = timer. GetTime();
   const double host_to_device_band_width = bytes / giga_byte / time;

   cout << "Transfering " << bytes << " bytes from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;
   cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
         host_vector. copyFrom( device_vector );
         host_vector2. copyFrom( device_vector );
   time = timer. GetTime();
   const double device_to_host_band_width = bytes / giga_byte / time;

   cout << "Transfering " << bytes << " bytes from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;
   cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
@@ -66,6 +159,9 @@ int main( int argc, char* argv[] )
   time = timer. GetTime();
   const double device_to_device_band_width = bytes / giga_byte / time;

   cout << "Transfering " << bytes << " bytes from DEVICE to DEVICE took " << time << " seconds. Bandwidth is " << device_to_device_band_width << " GB/s." << endl;
   cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to DEVICE took " << time << " seconds. Bandwidth is " << device_to_device_band_width << " GB/s." << endl;

   for( int i = 1; i <= 4; i ++ )
      reductionBenchmark( size, device_vector, i );

}