Debugging TNL benchmarks. (632205a1) · Commits · TNL / tnl-dev

tests/tnl-benchmarks.h

+92 −85

Original line number	Diff line number	Diff line
		@@ -21,6 +21,7 @@
		#include <core/mfuncs.h>
		#include <core/tnlTimerCPU.h>
		#include <../tests/unit-tests/core/tnl-cuda-kernels.h>
		#include <core/low-level/cuda-long-vector-kernels.h>

		template< class T >
		bool transferBenchmark( const int size,
		@@ -45,8 +46,8 @@ bool transferBenchmark( const int size,
		tnlTimerCPU timer;
		timer. Reset();
		for( int i = 0; i < cycles; i ++ )
		if( ! host_vector2. copyFrom( host_vector ) )
		return false;
		host_vector2 = host_vector;

		double time = timer. GetTime();
		double giga_byte = ( double ) ( 1 << 30 );
		host_to_host_band_width = bytes / giga_byte / time;
		@@ -55,8 +56,8 @@ bool transferBenchmark( const int size,

		timer. Reset();
		for( int i = 0; i < cycles; i ++ )
		if( ! device_vector. copyFrom( host_vector ) )
		return false;
		device_vector = host_vector;

		time = timer. GetTime();
		host_to_device_band_width = bytes / giga_byte / time;

		@@ -64,8 +65,8 @@ bool transferBenchmark( const int size,

		timer. Reset();
		for( int i = 0; i < cycles; i ++ )
		if( ! host_vector2. copyFrom( device_vector ) )
		return false;
		host_vector2 = device_vector;

		time = timer. GetTime();
		device_to_host_band_width = bytes / giga_byte / time;

		@@ -73,8 +74,8 @@ bool transferBenchmark( const int size,

		timer. Reset();
		for( int i = 0; i < cycles; i ++ )
		if( ! device_vector2. copyFrom( device_vector ) )
		return false;
		device_vector2 = device_vector;


		time = timer. GetTime();

		@@ -89,7 +90,7 @@ template< class T >
		void tnlCPUReductionSum( const tnlLongVector< T >& host_vector,
		T& sum )
		{
		const T* data = host_vector. Data();
		const T* data = host_vector. getVector();
		const int size = host_vector. getSize();
		sum = 0.0;
		for( int i = 0; i < size; i ++ )
		@@ -100,7 +101,7 @@ template< class T >
		void tnlCPUReductionMin( const tnlLongVector< T >& host_vector,
		T& min )
		{
		const T* data = host_vector. Data();
		const T* data = host_vector. getVector();
		const int size = host_vector. getSize();
		//tnlAssert( data );
		min = data[ 0 ];
		@@ -112,7 +113,7 @@ template< class T >
		void tnlCPUReductionMax( const tnlLongVector< T >& host_vector,
		T& max )
		{
		const T* data = host_vector. Data();
		const T* data = host_vector. getVector();
		const int size = host_vector. getSize();
		//tnlAssert( data );
		max = data[ 0 ];
		@@ -131,12 +132,12 @@ void reductionBenchmark( const int size,
		for( int i = 0; i < size; i ++ )
		host_vector[ i ] = i + 1;

		device_vector. copyFrom( host_vector );
		device_vector = host_vector;

		T sum, min, max;
		const long int reducing_cycles( 10 );

		tnlTimerCUDA timer;
		tnlTimerCPU timer;
		timer. Reset();
		for( int i = 0; i < reducing_cycles; i ++ )
		{
		@@ -147,87 +148,93 @@ void reductionBenchmark( const int size,
		tnlCPUReductionMin( host_vector, sum );
		tnlCPUReductionMax( host_vector, sum );
		case 1:
		tnlCUDASimpleReduction1Sum( size,
		tnlCUDASimpleReduction1< T, tnlParallelReductionSum >( size,
		device_vector. getVector(),
		sum,
		device_aux. getVector() );
		tnlCUDASimpleReduction1Min( size,
		tnlCUDASimpleReduction1< T, tnlParallelReductionMin >( size,
		device_vector. getVector(),
		min,
		device_aux. getVector() );
		tnlCUDASimpleReduction1Max( size,
		tnlCUDASimpleReduction1< T, tnlParallelReductionMax >( size,
		device_vector. getVector(),
		max,
		device_aux. getVector() );
		break;
		case 2:
		tnlCUDASimpleReduction2Sum( size,
		tnlCUDASimpleReduction2< T, tnlParallelReductionSum >( size,
		device_vector. getVector(),
		sum,
		device_aux. getVector() );
		tnlCUDASimpleReduction2Min( size,
		tnlCUDASimpleReduction2< T, tnlParallelReductionMin >( size,
		device_vector. getVector(),
		min,
		device_aux. getVector() );
		tnlCUDASimpleReduction2Max( size,
		tnlCUDASimpleReduction2< T, tnlParallelReductionMax >( size,
		device_vector. getVector(),
		max,
		device_aux. getVector() );
		break;
		case 3:
		tnlCUDASimpleReduction3Sum( size,
		tnlCUDASimpleReduction3< T, tnlParallelReductionSum >( size,
		device_vector. getVector(),
		sum,
		device_aux. getVector() );
		tnlCUDASimpleReduction3Min( size,
		tnlCUDASimpleReduction3< T, tnlParallelReductionMin >( size,
		device_vector. getVector(),
		min,
		device_aux. getVector() );
		tnlCUDASimpleReduction3Max( size,
		tnlCUDASimpleReduction3< T, tnlParallelReductionMax >( size,
		device_vector. getVector(),
		max,
		device_aux. getVector() );
		break;
		case 4:
		tnlCUDASimpleReduction4Sum( size,
		tnlCUDASimpleReduction4< T, tnlParallelReductionSum >( size,
		device_vector. getVector(),
		sum,
		device_aux. getVector() );
		tnlCUDASimpleReduction4Min( size,
		tnlCUDASimpleReduction4< T, tnlParallelReductionMin >( size,
		device_vector. getVector(),
		min,
		device_aux. getVector() );
		tnlCUDASimpleReduction4Max( size,
		tnlCUDASimpleReduction4< T, tnlParallelReductionMax >( size,
		device_vector. getVector(),
		max,
		device_aux. getVector() );
		break;
		case 5:
		tnlCUDASimpleReduction5Sum( size,
		tnlCUDASimpleReduction5< T, tnlParallelReductionSum >( size,
		device_vector. getVector(),
		sum,
		device_aux. getVector() );
		tnlCUDASimpleReduction5Min( size,
		tnlCUDASimpleReduction5< T, tnlParallelReductionMin >( size,
		device_vector. getVector(),
		min,
		device_aux. getVector() );
		tnlCUDASimpleReduction5Max( size,
		tnlCUDASimpleReduction5< T, tnlParallelReductionMax >( size,
		device_vector. getVector(),
		max,
		device_aux. getVector() );
		break;
		default:
		tnlCUDAReductionSum( size,
		tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionSum >( size,
		device_vector. getVector(),
		NULL,
		sum,
		0.0,
		device_aux. getVector() );
		tnlCUDAReductionMin( size,
		tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMin >( size,
		device_vector. getVector(),
		NULL,
		min,
		0.0,
		device_aux. getVector() );
		tnlCUDAReductionMax( size,
		tnlCUDALongVectorReduction< T, T, int, tnlParallelReductionMax >( size,
		device_vector. getVector(),
		NULL,
		max,
		0.0,
		device_aux. getVector() );

		}

tests/unit-tests/core/tnl-cuda-kernels.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -37,7 +37,7 @@ using namespace std;
		*
		*
		* For educational and also testing/debugging reasons we have 6 versions of this algorithm here.
		* Version 1 is the slowest and version 6 is the fastest - tested on CUDA architecture 1.0 - 1.3.
		* Version 1 is the slowest and version 6 is the fastest (can be found in cuda-long-vector-kernels.h) - tested on CUDA architecture 1.0 - 1.3.
		* Further improvements are possible for future devices.
		*
		*/