Loading tests/tnl-benchmarks.h +92 −85 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ #include <core/mfuncs.h> #include <core/tnlTimerCPU.h> #include <../tests/unit-tests/core/tnl-cuda-kernels.h> #include <core/low-level/cuda-long-vector-kernels.h> template< class T > bool transferBenchmark( const int size, Loading @@ -45,8 +46,8 @@ bool transferBenchmark( const int size, tnlTimerCPU timer; timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! host_vector2. copyFrom( host_vector ) ) return false; host_vector2 = host_vector; double time = timer. GetTime(); double giga_byte = ( double ) ( 1 << 30 ); host_to_host_band_width = bytes / giga_byte / time; Loading @@ -55,8 +56,8 @@ bool transferBenchmark( const int size, timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! device_vector. copyFrom( host_vector ) ) return false; device_vector = host_vector; time = timer. GetTime(); host_to_device_band_width = bytes / giga_byte / time; Loading @@ -64,8 +65,8 @@ bool transferBenchmark( const int size, timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! host_vector2. copyFrom( device_vector ) ) return false; host_vector2 = device_vector; time = timer. GetTime(); device_to_host_band_width = bytes / giga_byte / time; Loading @@ -73,8 +74,8 @@ bool transferBenchmark( const int size, timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! device_vector2. copyFrom( device_vector ) ) return false; device_vector2 = device_vector; time = timer. GetTime(); Loading @@ -89,7 +90,7 @@ template< class T > void tnlCPUReductionSum( const tnlLongVector< T >& host_vector, T& sum ) { const T* data = host_vector. Data(); const T* data = host_vector. getVector(); const int size = host_vector. getSize(); sum = 0.0; for( int i = 0; i < size; i ++ ) Loading @@ -100,7 +101,7 @@ template< class T > void tnlCPUReductionMin( const tnlLongVector< T >& host_vector, T& min ) { const T* data = host_vector. Data(); const T* data = host_vector. getVector(); const int size = host_vector. getSize(); //tnlAssert( data ); min = data[ 0 ]; Loading @@ -112,7 +113,7 @@ template< class T > void tnlCPUReductionMax( const tnlLongVector< T >& host_vector, T& max ) { const T* data = host_vector. Data(); const T* data = host_vector. getVector(); const int size = host_vector. getSize(); //tnlAssert( data ); max = data[ 0 ]; Loading @@ -131,12 +132,12 @@ void reductionBenchmark( const int size, for( int i = 0; i < size; i ++ ) host_vector[ i ] = i + 1; device_vector. copyFrom( host_vector ); device_vector = host_vector; T sum, min, max; const long int reducing_cycles( 10 ); tnlTimerCUDA timer; tnlTimerCPU timer; timer. Reset(); for( int i = 0; i < reducing_cycles; i ++ ) { Loading @@ -147,87 +148,93 @@ void reductionBenchmark( const int size, tnlCPUReductionMin( host_vector, sum ); tnlCPUReductionMax( host_vector, sum ); case 1: tnlCUDASimpleReduction1Sum( size, tnlCUDASimpleReduction1< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction1Min( size, tnlCUDASimpleReduction1< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction1Max( size, tnlCUDASimpleReduction1< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 2: tnlCUDASimpleReduction2Sum( size, tnlCUDASimpleReduction2< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction2Min( size, tnlCUDASimpleReduction2< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction2Max( size, tnlCUDASimpleReduction2< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 3: tnlCUDASimpleReduction3Sum( size, tnlCUDASimpleReduction3< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction3Min( size, tnlCUDASimpleReduction3< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction3Max( size, tnlCUDASimpleReduction3< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 4: tnlCUDASimpleReduction4Sum( size, tnlCUDASimpleReduction4< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction4Min( size, tnlCUDASimpleReduction4< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction4Max( size, tnlCUDASimpleReduction4< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 5: tnlCUDASimpleReduction5Sum( size, tnlCUDASimpleReduction5< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction5Min( size, tnlCUDASimpleReduction5< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction5Max( size, tnlCUDASimpleReduction5< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; default: tnlCUDAReductionSum( size, tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionSum >( size, device_vector. getVector(), NULL, sum, 0.0, device_aux. getVector() ); tnlCUDAReductionMin( size, tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMin >( size, device_vector. getVector(), NULL, min, 0.0, device_aux. getVector() ); tnlCUDAReductionMax( size, tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMax >( size, device_vector. getVector(), NULL, max, 0.0, device_aux. getVector() ); } Loading tests/unit-tests/core/tnl-cuda-kernels.h +1 −1 Original line number Diff line number Diff line Loading @@ -37,7 +37,7 @@ using namespace std; * * * For the educative and also testing/debuging reasons we have 6 version of this algorithm here. * Version 1 is the slowest and version 6 is the fastest - tested on CUDA architecture 1.0 - 1.3. * Version 1 is the slowest and version 6 is the fastest (can be found in cuda-long-vector-kernels.h)- tested on CUDA architecture 1.0 - 1.3. * Another improvements are possible for the future devices. * */ Loading Loading
tests/tnl-benchmarks.h +92 −85 Original line number Diff line number Diff line Loading @@ -21,6 +21,7 @@ #include <core/mfuncs.h> #include <core/tnlTimerCPU.h> #include <../tests/unit-tests/core/tnl-cuda-kernels.h> #include <core/low-level/cuda-long-vector-kernels.h> template< class T > bool transferBenchmark( const int size, Loading @@ -45,8 +46,8 @@ bool transferBenchmark( const int size, tnlTimerCPU timer; timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! host_vector2. copyFrom( host_vector ) ) return false; host_vector2 = host_vector; double time = timer. GetTime(); double giga_byte = ( double ) ( 1 << 30 ); host_to_host_band_width = bytes / giga_byte / time; Loading @@ -55,8 +56,8 @@ bool transferBenchmark( const int size, timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! device_vector. copyFrom( host_vector ) ) return false; device_vector = host_vector; time = timer. GetTime(); host_to_device_band_width = bytes / giga_byte / time; Loading @@ -64,8 +65,8 @@ bool transferBenchmark( const int size, timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! host_vector2. copyFrom( device_vector ) ) return false; host_vector2 = device_vector; time = timer. GetTime(); device_to_host_band_width = bytes / giga_byte / time; Loading @@ -73,8 +74,8 @@ bool transferBenchmark( const int size, timer. Reset(); for( int i = 0; i < cycles; i ++ ) if( ! device_vector2. copyFrom( device_vector ) ) return false; device_vector2 = device_vector; time = timer. GetTime(); Loading @@ -89,7 +90,7 @@ template< class T > void tnlCPUReductionSum( const tnlLongVector< T >& host_vector, T& sum ) { const T* data = host_vector. Data(); const T* data = host_vector. getVector(); const int size = host_vector. getSize(); sum = 0.0; for( int i = 0; i < size; i ++ ) Loading @@ -100,7 +101,7 @@ template< class T > void tnlCPUReductionMin( const tnlLongVector< T >& host_vector, T& min ) { const T* data = host_vector. Data(); const T* data = host_vector. getVector(); const int size = host_vector. getSize(); //tnlAssert( data ); min = data[ 0 ]; Loading @@ -112,7 +113,7 @@ template< class T > void tnlCPUReductionMax( const tnlLongVector< T >& host_vector, T& max ) { const T* data = host_vector. Data(); const T* data = host_vector. getVector(); const int size = host_vector. getSize(); //tnlAssert( data ); max = data[ 0 ]; Loading @@ -131,12 +132,12 @@ void reductionBenchmark( const int size, for( int i = 0; i < size; i ++ ) host_vector[ i ] = i + 1; device_vector. copyFrom( host_vector ); device_vector = host_vector; T sum, min, max; const long int reducing_cycles( 10 ); tnlTimerCUDA timer; tnlTimerCPU timer; timer. Reset(); for( int i = 0; i < reducing_cycles; i ++ ) { Loading @@ -147,87 +148,93 @@ void reductionBenchmark( const int size, tnlCPUReductionMin( host_vector, sum ); tnlCPUReductionMax( host_vector, sum ); case 1: tnlCUDASimpleReduction1Sum( size, tnlCUDASimpleReduction1< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction1Min( size, tnlCUDASimpleReduction1< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction1Max( size, tnlCUDASimpleReduction1< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 2: tnlCUDASimpleReduction2Sum( size, tnlCUDASimpleReduction2< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction2Min( size, tnlCUDASimpleReduction2< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction2Max( size, tnlCUDASimpleReduction2< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 3: tnlCUDASimpleReduction3Sum( size, tnlCUDASimpleReduction3< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction3Min( size, tnlCUDASimpleReduction3< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction3Max( size, tnlCUDASimpleReduction3< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 4: tnlCUDASimpleReduction4Sum( size, tnlCUDASimpleReduction4< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction4Min( size, tnlCUDASimpleReduction4< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction4Max( size, tnlCUDASimpleReduction4< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; case 5: tnlCUDASimpleReduction5Sum( size, tnlCUDASimpleReduction5< T, tnlParallelReductionSum >( size, device_vector. getVector(), sum, device_aux. getVector() ); tnlCUDASimpleReduction5Min( size, tnlCUDASimpleReduction5< T, tnlParallelReductionMin >( size, device_vector. getVector(), min, device_aux. getVector() ); tnlCUDASimpleReduction5Max( size, tnlCUDASimpleReduction5< T, tnlParallelReductionMax >( size, device_vector. getVector(), max, device_aux. getVector() ); break; default: tnlCUDAReductionSum( size, tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionSum >( size, device_vector. getVector(), NULL, sum, 0.0, device_aux. getVector() ); tnlCUDAReductionMin( size, tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMin >( size, device_vector. getVector(), NULL, min, 0.0, device_aux. getVector() ); tnlCUDAReductionMax( size, tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMax >( size, device_vector. getVector(), NULL, max, 0.0, device_aux. getVector() ); } Loading
tests/unit-tests/core/tnl-cuda-kernels.h +1 −1 Original line number Diff line number Diff line Loading @@ -37,7 +37,7 @@ using namespace std; * * * For the educative and also testing/debuging reasons we have 6 version of this algorithm here. * Version 1 is the slowest and version 6 is the fastest - tested on CUDA architecture 1.0 - 1.3. * Version 1 is the slowest and version 6 is the fastest (can be found in cuda-long-vector-kernels.h)- tested on CUDA architecture 1.0 - 1.3. * Another improvements are possible for the future devices. * */ Loading