/*************************************************************************** tnl-benchmarks.h - description ------------------- begin : Jan 27, 2010 copyright : (C) 2010 by Tomas Oberhuber email : tomas.oberhuber@fjfi.cvut.cz ***************************************************************************/ /*************************************************************************** * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU General Public License as published by * * the Free Software Foundation; either version 2 of the License, or * * (at your option) any later version. * * * ***************************************************************************/ #ifndef TNLBENCHMARKS_H_ #define TNLBENCHMARKS_H_ #include <core/mfuncs.h> #include <core/tnlTimerCUDA.h> template< class T > bool transferBenchmark( const int size, double& host_to_host_band_width, double& host_to_device_band_width, double& device_to_host_band_width, double& device_to_device_band_width ) { tnlLongVector< T > host_vector( "transferBenchmark:host-vector", size ); tnlLongVector< T > host_vector2( "transferBenchmark:host-vector-2", size ); tnlLongVector< T, tnlCuda > device_vector( "transferBenchmark:device-vector", size ); tnlLongVector< T, tnlCuda > device_vector2( "transferBenchmark:device-vector-2", size ); for( int i = 0; i < size; i ++ ) host_vector[ i ] = i + 1; const long int cycles = 100; long int bytes = cycles * size * sizeof( int ); long int mega_byte = 1 << 20; tnlTimerCUDA timer; timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! host_vector2. copyFrom( host_vector ) ) return false; double time = timer. GetTime(); double giga_byte = ( double ) ( 1 << 30 ); host_to_host_band_width = bytes / giga_byte / time; cout << "Transfering " << bytes / mega_byte << " MB from HOST to HOST took " << time << " seconds. Bandwidth is " << host_to_host_band_width << " GB/s." << endl; timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! device_vector. copyFrom( host_vector ) ) return false; time = timer. GetTime(); host_to_device_band_width = bytes / giga_byte / time; cout << "Transfering " << bytes / mega_byte << " MB from HOST to DEVICE took " << time << " seconds. Bandwidth is " << host_to_device_band_width << " GB/s." << endl; timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! host_vector2. copyFrom( device_vector ) ) return false; time = timer. GetTime(); device_to_host_band_width = bytes / giga_byte / time; cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to HOST took " << time << " seconds. Bandwidth is " << device_to_host_band_width << " GB/s." << endl; timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! device_vector2. copyFrom( device_vector ) ) return false; time = timer. GetTime(); // Since we read and write tha data back we process twice as many bytes. bytes *= 2; device_to_device_band_width = bytes / giga_byte / time; cout << "Transfering " << bytes / mega_byte << " MB from DEVICE to DEVICE took " << time << " seconds. Bandwidth is " << device_to_device_band_width << " GB/s." << endl; } template< class T > void tnlCPUReductionSum( const tnlLongVector< T >& host_vector, T& sum ) { const T* data = host_vector. Data(); const int size = host_vector. getSize(); sum = 0.0; for( int i = 0; i < size; i ++ ) sum += data[ i ]; }; template< class T > void tnlCPUReductionMin( const tnlLongVector< T >& host_vector, T& min ) { const T* data = host_vector. Data(); const int size = host_vector. getSize(); //tnlAssert( data ); min = data[ 0 ]; for( int i = 1; i < size; i ++ ) min = :: Min( min, data[ i ] ); }; template< class T > void tnlCPUReductionMax( const tnlLongVector< T >& host_vector, T& max ) { const T* data = host_vector. Data(); const int size = host_vector. getSize(); //tnlAssert( data ); max = data[ 0 ]; for( int i = 1; i < size; i ++ ) max = :: Max( max, data[ i ] ); }; template< class T > void reductionBenchmark( const int size, const int algorithm ) { tnlLongVector< T > host_vector( "reductionBenchmark:host-vector", size ); tnlLongVector< T, tnlCuda > device_vector( "reductionBenchmark:device-vector", size ); tnlLongVector< T, tnlCuda > device_aux( "reductionBenchmark:device-aux", size / 2 ); for( int i = 0; i < size; i ++ ) host_vector[ i ] = i + 1; device_vector. copyFrom( host_vector ); T sum, min, max; const long int reducing_cycles( 10 ); tnlTimerCUDA timer; timer. Reset(); for( int i = 0; i < reducing_cycles; i ++ ) { switch( algorithm ) { case 0: // reduction on CPU tnlCPUReductionSum( host_vector, sum ); tnlCPUReductionMin( host_vector, sum ); tnlCPUReductionMax( host_vector, sum ); case 1: tnlCUDASimpleReduction1Sum( size, device_vector. Data(), sum, device_aux. Data() ); tnlCUDASimpleReduction1Min( size, device_vector. Data(), min, device_aux. Data() ); tnlCUDASimpleReduction1Max( size, device_vector. Data(), max, device_aux. Data() ); break; case 2: tnlCUDASimpleReduction2Sum( size, device_vector. Data(), sum, device_aux. Data() ); tnlCUDASimpleReduction2Min( size, device_vector. Data(), min, device_aux. Data() ); tnlCUDASimpleReduction2Max( size, device_vector. Data(), max, device_aux. Data() ); break; case 3: tnlCUDASimpleReduction3Sum( size, device_vector. Data(), sum, device_aux. Data() ); tnlCUDASimpleReduction3Min( size, device_vector. Data(), min, device_aux. Data() ); tnlCUDASimpleReduction3Max( size, device_vector. Data(), max, device_aux. Data() ); break; case 4: tnlCUDASimpleReduction4Sum( size, device_vector. Data(), sum, device_aux. Data() ); tnlCUDASimpleReduction4Min( size, device_vector. Data(), min, device_aux. Data() ); tnlCUDASimpleReduction4Max( size, device_vector. Data(), max, device_aux. Data() ); break; case 5: tnlCUDASimpleReduction5Sum( size, device_vector. Data(), sum, device_aux. Data() ); tnlCUDASimpleReduction5Min( size, device_vector. Data(), min, device_aux. Data() ); tnlCUDASimpleReduction5Max( size, device_vector. Data(), max, device_aux. Data() ); break; default: tnlCUDAReductionSum( size, device_vector. Data(), sum, device_aux. Data() ); tnlCUDAReductionMin( size, device_vector. Data(), min, device_aux. Data() ); tnlCUDAReductionMax( size, device_vector. Data(), max, device_aux. Data() ); } } const double time = timer. GetTime(); double giga_byte = ( double ) ( 1 << 30 ); long int mega_byte = 1 << 20; long int bytes_reduced = size * sizeof( T ) * reducing_cycles * 3; const double reduction_band_width = bytes_reduced / giga_byte / time; cout << "Reducing " << bytes_reduced / mega_byte << " MB on DEVICE using algorithm " << algorithm << " took " << time << " seconds. Bandwidth is " << reduction_band_width << " GB/s." << endl; } #endif /* TNLBENCHMARKS_H_ */