Implementing CUDA parallel reduction. (025646c7) · Commits · TNL / tnl-dev

src/core/tnl-cuda-kernels.h

+51 −38

Original line number	Diff line number	Diff line
		@@ -627,13 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		sdata[ tid ] = d_input[ gid ];
		}
		__syncthreads();
		dbg_array1[ tid ] = sdata[ tid ];
		//dbg_array1[ tid ] = tid; //sdata[ tid ];

		// Parallel reduction
		int n = last_tid < blockDim. x ? last_tid : blockDim. x;
		for( int s = n / 2; s > 0; s >>= 1 )
		{
		if( tid < s && tid + s < n )
		if( tid < s )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		@@ -645,16 +645,15 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		/* This is for the case when we have odd number of elements.
		* The last one will be reduced using the thread with ID 0.
		*/
		if( 2 * s < n && tid == 0 )
		if( 2 * s < n && tid == n - 1 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] );
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] );
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ tid ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ n ];
		dbg_array1[ n ] = -555; //sdata[ 0 ];

		sdata[ 0 ] += sdata[ tid ];
		dbg_array1[ 0 ] = sdata[ tid ];
		}
		n = s;

		@@ -663,7 +662,7 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,

		}

		// Store the result back in global memory
		// Store the result back in the global memory
		if( tid == 0 )
		d_output[ blockIdx. x ] = sdata[ 0 ];
		}
		@@ -692,7 +691,7 @@ bool tnlCUDASimpleReduction4( const int size,
		}
		device_output_allocated = true;

		cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
		}
		dim3 block_size( 0 ), grid_size( 0 );
		@@ -706,19 +705,19 @@ bool tnlCUDASimpleReduction4( const int size,
		if( grid_size. x * 2 * block_size. x < size_reduced )
		grid_size. x ++;
		shmem = block_size. x * sizeof( T );
		cout << "Size: " << size_reduced
		/*cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		<< " Block size: " << block_size. x
		<< " Shmem: " << shmem << endl;
		<< " Shmem: " << shmem << endl;*/
		tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		size_reduced = grid_size. x;
		reduction_input = device_output;

		// debuging part
		T* host_array = new T[ desBlockSize ];
		/T host_array = new T[ desBlockSize ];
		cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost );
		for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ )
		cout << host_array[ i ] << " - ";
		cout << host_array[ i ] << " ";
		cout << endl;

		T* output = new T[ size_reduced ];
		@@ -727,7 +726,7 @@ bool tnlCUDASimpleReduction4( const int size,
		for( int i = 0; i < size_reduced; i ++ )
		cout << output[ i ] << " ";
		cout << endl;
		delete[] output;
		delete[] output;*/
		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		@@ -772,7 +771,8 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size,
		__syncthreads();

		// Parallel reduction
		for( int s = blockDim. x / 2; s > 0; s >>= 1 )
		int n = last_tid < blockDim. x ? last_tid : blockDim. x;
		for( int s = n / 2; s > 0; s >>= 1 )
		{
		if( tid < s && tid + s < last_tid )
		{
		@@ -783,6 +783,19 @@ __global__ void tnlCUDASimpleReductionKernel3( const int size,
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + s ];
		}
		/* This is for the case when we have odd number of elements.
		* The last one will be reduced using the thread with ID 0.
		*/
		if( 2 * s < n && tid == 0 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n - 1 ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n - 1 ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ n - 1 ];
		}
		n = s;
		__syncthreads();
		}

src/core/tnlCUDAKernelsTester.h

+34 −22

Original line number	Diff line number	Diff line
		@@ -156,7 +156,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		}


		if( min == seq_min )
		/*if( min == seq_min )
		cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl;
		else
		cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl;
		@@ -167,11 +167,16 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		if( sum == seq_sum )
		cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl;
		else
		cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;
		cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;*/

		T param;
		if( GetParameterType( param ) == "float" )
		if( GetParameterType( param ) == "float" \|\|
		GetParameterType( param ) == "double" )
		{
		if( min != seq_min )
		cerr << "Diff. min = " << min << " seq. min = " << seq_min;
		if( max != seq_max )
		cerr << "Diff. max = " << max << " seq. max = " << seq_max;
		CPPUNIT_ASSERT( min == seq_min );
		CPPUNIT_ASSERT( max == seq_max );
		if( sum == 0.0 )
		@@ -181,11 +186,19 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		else
		{
		double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum;
		if( fabs( diff > 1.0e-5 ) )
		cerr << "Diff is " << diff << " for " << GetParameterType( param ) << endl;
		CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 );
		}
		}
		else
		{
		if( min != seq_min )
		cerr << "Diff. min = " << min << " seq. min = " << seq_min;
		if( max != seq_max )
		cerr << "Diff. max = " << max << " seq. max = " << seq_max;
		if( sum != seq_sum )
		cerr << "Diff. sum = " << sum << " seq. sum = " << seq_sum;
		CPPUNIT_ASSERT( min == seq_min );
		CPPUNIT_ASSERT( max == seq_max );
		CPPUNIT_ASSERT( sum == seq_sum );
		@@ -197,10 +210,11 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		{
		tnlLongVector< T > host_input;
		int size = 2;
		/*for( int s = 1; s < 12; s ++ )
		for( int s = 1; s < 12; s ++ )
		{
		tnlLongVector< T > host_input( size );

		cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 0.0;
		mainReduction( host_input,
		@@ -208,20 +222,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		256,
		2048 );

		cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 1.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = i;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		mainReduction( host_input,
		@@ -229,46 +244,43 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		256,
		2048 );
		size *= 2;
		}*/
		for( size = 257; size < 5000; size ++ )
		cout << endl;
		}
		for( size = 1; size < 5000; size ++ )
		{
		cout << "*********** Size is " << size << " ***************** " << endl;
		tnlLongVector< T > host_input( size );

		/*for( int i = 0; i < size; i ++ )
		cout << "Alg. " << algorithm_efficiency << "Testing zeros with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 0.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );*/
		2048 );

		cout << "Alg. " << algorithm_efficiency << "Testing ones with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 1.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		cout << "Alg. " << algorithm_efficiency << "Testing linear sequence with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = i;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		cout << "Alg. " << algorithm_efficiency << "Testing quadratic sequence with size " << size << " ";
		for( int i = 0; i < size; i ++ )
		host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		cout << endl;
		}





		};

		void testReduction()
		@@ -298,13 +310,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		void testSimpleReduction2()
		{
		cout << "Test reduction 2" << endl;
		//testReduction( 2 );
		testReduction( 2 );
		};

		void testSimpleReduction1()
		{
		cout << "Test reduction 1" << endl;
		//testReduction( 1 );
		testReduction( 1 );
		};