Commit 632205a1 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Debugging TNL benchmarks.

parent 3fdeb2a6
Loading
Loading
Loading
Loading
+92 −85
Original line number Diff line number Diff line
@@ -21,6 +21,7 @@
#include <core/mfuncs.h>
#include <core/tnlTimerCPU.h>
#include <../tests/unit-tests/core/tnl-cuda-kernels.h>
#include <core/low-level/cuda-long-vector-kernels.h>

template< class T >
bool transferBenchmark( const int size,
@@ -45,8 +46,8 @@ bool transferBenchmark( const int size,
   tnlTimerCPU timer;
   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      if( ! host_vector2. copyFrom( host_vector ) )
         return false;
      host_vector2 = host_vector;

   double time = timer. GetTime();
   double giga_byte = ( double ) ( 1 << 30 );
   host_to_host_band_width = bytes / giga_byte / time;
@@ -55,8 +56,8 @@ bool transferBenchmark( const int size,

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      if( ! device_vector. copyFrom( host_vector ) )
         return false;
      device_vector = host_vector;

   time = timer. GetTime();
   host_to_device_band_width = bytes / giga_byte / time;

@@ -64,8 +65,8 @@ bool transferBenchmark( const int size,

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      if( ! host_vector2. copyFrom( device_vector ) )
         return false;
      host_vector2 = device_vector;

   time = timer. GetTime();
   device_to_host_band_width = bytes / giga_byte / time;

@@ -73,8 +74,8 @@ bool transferBenchmark( const int size,

   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      if( ! device_vector2. copyFrom( device_vector ) )
         return false;
      device_vector2 = device_vector;


   time = timer. GetTime();

@@ -89,7 +90,7 @@ template< class T >
void tnlCPUReductionSum( const tnlLongVector< T >& host_vector,
                         T& sum )
{
   const T* data = host_vector. Data();
   const T* data = host_vector. getVector();
   const int size = host_vector. getSize();
   sum = 0.0;
   for( int i = 0; i < size; i ++ )
@@ -100,7 +101,7 @@ template< class T >
void tnlCPUReductionMin( const tnlLongVector< T >& host_vector,
                         T& min )
{
   const T* data = host_vector. Data();
   const T* data = host_vector. getVector();
   const int size = host_vector. getSize();
   //tnlAssert( data );
   min = data[ 0 ];
@@ -112,7 +113,7 @@ template< class T >
void tnlCPUReductionMax( const tnlLongVector< T >& host_vector,
                         T& max )
{
   const T* data = host_vector. Data();
   const T* data = host_vector. getVector();
   const int size = host_vector. getSize();
   //tnlAssert( data );
   max = data[ 0 ];
@@ -131,12 +132,12 @@ void reductionBenchmark( const int size,
   for( int i = 0; i < size; i ++ )
      host_vector[ i ] = i + 1;

   device_vector. copyFrom( host_vector );
   device_vector = host_vector;

   T sum, min, max;
   const long int reducing_cycles( 10 );

   tnlTimerCUDA timer;
   tnlTimerCPU timer;
   timer. Reset();
   for( int i = 0; i < reducing_cycles; i ++ )
   {
@@ -147,87 +148,93 @@ void reductionBenchmark( const int size,
            tnlCPUReductionMin( host_vector, sum );
            tnlCPUReductionMax( host_vector, sum );
         case 1:
            tnlCUDASimpleReduction1Sum( size,
            tnlCUDASimpleReduction1< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getVector(),
                                                                   sum,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction1Min( size,
            tnlCUDASimpleReduction1< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getVector(),
                                                                   min,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction1Max( size,
            tnlCUDASimpleReduction1< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getVector(),
                                                                   max,
                                                                   device_aux. getVector() );
            break;
         case 2:
            tnlCUDASimpleReduction2Sum( size,
            tnlCUDASimpleReduction2< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getVector(),
                                                                   sum,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction2Min( size,
            tnlCUDASimpleReduction2< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getVector(),
                                                                   min,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction2Max( size,
            tnlCUDASimpleReduction2< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getVector(),
                                                                   max,
                                                                   device_aux. getVector() );
            break;
         case 3:
            tnlCUDASimpleReduction3Sum( size,
            tnlCUDASimpleReduction3< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getVector(),
                                                                   sum,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction3Min( size,
            tnlCUDASimpleReduction3< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getVector(),
                                                                   min,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction3Max( size,
            tnlCUDASimpleReduction3< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getVector(),
                                                                   max,
                                                                   device_aux. getVector() );
            break;
         case 4:
            tnlCUDASimpleReduction4Sum( size,
            tnlCUDASimpleReduction4< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getVector(),
                                                                   sum,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction4Min( size,
            tnlCUDASimpleReduction4< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getVector(),
                                                                   min,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction4Max( size,
            tnlCUDASimpleReduction4< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getVector(),
                                                                   max,
                                                                   device_aux. getVector() );
            break;
         case 5:
            tnlCUDASimpleReduction5Sum( size,
            tnlCUDASimpleReduction5< T, tnlParallelReductionSum >( size,
                                                                   device_vector. getVector(),
                                                                   sum,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction5Min( size,
            tnlCUDASimpleReduction5< T, tnlParallelReductionMin >( size,
                                                                   device_vector. getVector(),
                                                                   min,
                                                                   device_aux. getVector() );
            tnlCUDASimpleReduction5Max( size,
            tnlCUDASimpleReduction5< T, tnlParallelReductionMax >( size,
                                                                   device_vector. getVector(),
                                                                   max,
                                                                   device_aux. getVector() );
            break;
         default:
            tnlCUDAReductionSum( size,
            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionSum >( size,
                                                                              device_vector. getVector(),
                                                                              NULL,
                                                                              sum,
                                                                              0.0,
                                                                              device_aux. getVector() );
            tnlCUDAReductionMin( size,
            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMin >( size,
                                                                              device_vector. getVector(),
                                                                              NULL,
                                                                              min,
                                                                              0.0,
                                                                              device_aux. getVector() );
            tnlCUDAReductionMax( size,
            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMax >( size,
                                                                              device_vector. getVector(),
                                                                              NULL,
                                                                              max,
                                                                              0.0,
                                                                              device_aux. getVector() );

      }
+1 −1
Original line number Diff line number Diff line
@@ -37,7 +37,7 @@ using namespace std;
 *
 *
 * For educational and also testing/debugging reasons we have 6 versions of this algorithm here.
 * Version 1 is the slowest and version 6 is the fastest - tested on CUDA architecture 1.0 - 1.3.
 * Version 1 is the slowest and version 6 is the fastest (it can be found in cuda-long-vector-kernels.h) - tested on CUDA architectures 1.0 - 1.3.
 * Further improvements are possible for future devices.
 *
 */