/***************************************************************************
                              tnl-benchmarks.h  -  description
                                 -------------------
        begin                : Jan 27, 2010
        copyright            : (C) 2010 by Tomas Oberhuber
        email                : tomas.oberhuber@fjfi.cvut.cz
     ***************************************************************************/
    
    /***************************************************************************
     *                                                                         *
     *   This program is free software; you can redistribute it and/or modify  *
     *   it under the terms of the GNU General Public License as published by  *
     *   the Free Software Foundation; either version 2 of the License, or     *
     *   (at your option) any later version.                                   *
     *                                                                         *
     ***************************************************************************/
    
    #ifndef TNLBENCHMARKS_H_
    #define TNLBENCHMARKS_H_
    
    #include <core/mfuncs.h>
    
    #include <core/tnlTimerCPU.h>
    #include <../tests/unit-tests/core/tnl-cuda-kernels.h>
    
    #include <core/low-level/cuda-long-vector-kernels.h>
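
/****
 * transferBenchmark measures the copy bandwidth between HOST and DEVICE
 * memory for a vector of 'size' elements of type T. Each transfer
 * (HOST->HOST, HOST->DEVICE, DEVICE->HOST and DEVICE->DEVICE) is repeated
 * 'cycles' times and the measured bandwidths are returned through the
 * reference parameters.
 */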
    
    
    template< class T >
    bool transferBenchmark( const int size,
                            double& host_to_host_band_width,
                            double& host_to_device_band_width,
                            double& device_to_host_band_width,
                            double& device_to_device_band_width )
{
   tnlVector< T > host_vector( "transferBenchmark:host-vector", size );
   tnlVector< T > host_vector2( "transferBenchmark:host-vector-2", size );
   tnlVector< T, tnlCuda > device_vector( "transferBenchmark:device-vector", size );
   tnlVector< T, tnlCuda > device_vector2( "transferBenchmark:device-vector-2", size );

   for( int i = 0; i < size; i ++ )
      host_vector[ i ] = i + 1;

   const long int cycles = 100;
   long int bytes = cycles * size * sizeof( T );
   long int mega_byte = 1 << 20;

   tnlTimerCPU timer;
   for( int i = 0; i < cycles; i ++ )
      host_vector2 = host_vector;

   double time = timer. GetTime();
   double giga_byte = ( double ) ( 1 << 30 );
   host_to_host_band_width = bytes / giga_byte / time;
    
       cout << "Transfering " << bytes / mega_byte << " MB from HOST to HOST took " << time << " seconds. Bandwidth is " << host_to_host_band_width << " GB/s." << endl;
    
   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      device_vector = host_vector;

   time = timer. GetTime();
   host_to_device_band_width = bytes / giga_byte / time;
    
       cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl;
    
   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      host_vector2 = device_vector;

   time = timer. GetTime();
   device_to_host_band_width = bytes / giga_byte / time;
    
       cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl;
    
   timer. Reset();
   for( int i = 0; i < cycles; i ++ )
      device_vector2 = device_vector;

   time = timer. GetTime();

   // Since we read and write the data back, we process twice as many bytes.
   bytes *= 2;
   device_to_device_band_width = bytes / giga_byte / time;
    
       cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to DEVICE took " << time << " seconds. Bandwidth is " << device_to_device_band_width << " GB/s." << endl;
    }
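
/****
 * A minimal usage sketch, assuming a hypothetical caller that chooses the
 * vector size 'benchmarkSize' (not defined in this header):
 *
 *    double h2h, h2d, d2h, d2d;
 *    transferBenchmark< float >( benchmarkSize, h2h, h2d, d2h, d2d );
 */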
    
template< class T >
void tnlCPUReductionSum( const tnlVector< T >& host_vector,
                         T& sum )
{
   const T* data = host_vector. getData();
   const int size = host_vector. getSize();
   sum = 0.0;
   for( int i = 0; i < size; i ++ )
      sum += data[ i ];
};
    
template< class T >
void tnlCPUReductionMin( const tnlVector< T >& host_vector,
                         T& min )
{
   const T* data = host_vector. getData();
   const int size = host_vector. getSize();
   //tnlAssert( data );
   min = data[ 0 ];
   for( int i = 1; i < size; i ++ )
      min = :: Min( min, data[ i ] );
};
    
template< class T >
void tnlCPUReductionMax( const tnlVector< T >& host_vector,
                         T& max )
{
   const T* data = host_vector. getData();
   const int size = host_vector. getSize();
   //tnlAssert( data );
   max = data[ 0 ];
   for( int i = 1; i < size; i ++ )
      max = :: Max( max, data[ i ] );
};
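
/****
 * reductionBenchmark measures the bandwidth of sum, min and max reductions
 * over a vector of 'size' elements. The 'algorithm' parameter selects the
 * implementation being timed: the CPU reduction above or one of the CUDA
 * reduction kernels from tnl-cuda-kernels.h.
 */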
    
    template< class T >
    void reductionBenchmark( const int size,
                             const int algorithm )
{
   tnlVector< T > host_vector( "reductionBenchmark:host-vector", size );
   tnlVector< T, tnlCuda > device_vector( "reductionBenchmark:device-vector", size );
   tnlVector< T, tnlCuda > device_aux( "reductionBenchmark:device-aux", size / 2 );

   device_vector = host_vector;

   const long int reducing_cycles( 1 );
   T sum( 0.0 ), min( 0.0 ), max( 0.0 );

   tnlTimerCPU timer;

   timer. Reset();
       for( int i = 0; i < reducing_cycles; i ++ )
       {
          switch( algorithm )
          {
         case 0:  // reduction on CPU
            tnlCPUReductionSum( host_vector, sum );
            tnlCPUReductionMin( host_vector, min );
            tnlCPUReductionMax( host_vector, max );
            break;
#ifdef HAVE_CUDA
         case 1:  // simple CUDA reduction, version 1
            tnlCUDASimpleReduction1< T, tnlParallelReductionSum >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction1< T, tnlParallelReductionMin >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction1< T, tnlParallelReductionMax >( size, device_vector. getData(), device_aux. getData() );
            break;
         case 2:  // simple CUDA reduction, version 2
            tnlCUDASimpleReduction2< T, tnlParallelReductionSum >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction2< T, tnlParallelReductionMin >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction2< T, tnlParallelReductionMax >( size, device_vector. getData(), device_aux. getData() );
            break;
         case 3:  // simple CUDA reduction, version 3
            tnlCUDASimpleReduction3< T, tnlParallelReductionSum >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction3< T, tnlParallelReductionMin >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction3< T, tnlParallelReductionMax >( size, device_vector. getData(), device_aux. getData() );
            break;
         case 4:  // simple CUDA reduction, version 4
            tnlCUDASimpleReduction4< T, tnlParallelReductionSum >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction4< T, tnlParallelReductionMin >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction4< T, tnlParallelReductionMax >( size, device_vector. getData(), device_aux. getData() );
            break;
         case 5:  // simple CUDA reduction, version 5
            tnlCUDASimpleReduction5< T, tnlParallelReductionSum >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction5< T, tnlParallelReductionMin >( size, device_vector. getData(), device_aux. getData() );
            tnlCUDASimpleReduction5< T, tnlParallelReductionMax >( size, device_vector. getData(), device_aux. getData() );
            break;
         case 6:  // CUDA long vector reduction
            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionSum >( size, device_vector. getData(), NULL, sum, 0.0, device_aux. getData() );
            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMin >( size, device_vector. getData(), NULL, min, 0.0, device_aux. getData() );
            tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMax >( size, device_vector. getData(), NULL, max, 0.0, device_aux. getData() );
            break;
#endif
          }
       }
       const double time = timer. GetTime();
       double giga_byte = ( double ) ( 1 << 30 );
       long int mega_byte = 1 << 20;
       long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3;
       const double reduction_band_width = bytes_reduced / giga_byte / time;
    
       cout << "Reducing " << bytes_reduced / mega_byte
            << " MB on DEVICE using algorithm " << algorithm
            << " took " << time
            << " seconds. Bandwidth is " << reduction_band_width
            << " GB/s." << endl;
    }
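
/****
 * A minimal usage sketch, assuming a hypothetical driver variable
 * 'benchmarkSize' and iterating over the algorithm identifiers handled in
 * the switch above:
 *
 *    for( int algorithm = 0; algorithm <= 6; algorithm ++ )
 *       reductionBenchmark< float >( benchmarkSize, algorithm );
 */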
    
    #endif /* TNLBENCHMARKS_H_ */