Implementing CUDA parallel reduction - problem with odd number of elements in a block. (016f90e1) · Commits · TNL / tnl-dev

src/core/tnl-cuda-kernels.h

+85 −39

Original line number	Diff line number	Diff line
		@@ -60,17 +60,19 @@ __global__ void tnlCUDAReductionKernel( const int size,

		// Read data into the shared memory
		int tid = threadIdx. x;
		int gid = 2 * blockSize * blockDim. x + threadIdx. x;
		int gid = 2 * blockDim. x * blockIdx. x + threadIdx. x;

		if( gid + blockSize < size )
		{
		if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
		//dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -1;
		}
		else if( gid < size )
		{
		sdata[ tid ] = d_input[ gid ];
		//dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -2;
		}
		gid += grid_size;

		@@ -80,8 +82,9 @@ __global__ void tnlCUDAReductionKernel( const int size,
		if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ) );
		if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
		gid += grid_size;
		//dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = -4;
		}
		dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ];
		//dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ];
		__syncthreads();

		if( gid + blockDim. x < size )
		@@ -216,6 +219,9 @@ bool tnlCUDAReduction( const int size,
		T& result,
		T* device_output = 0 )
		{
		if( ! size ) return false;


		//Calculate necessary block/grid dimensions
		const int cpuThreshold = 1;
		const int desBlockSize = 16; //Desired block size
		@@ -235,7 +241,7 @@ bool tnlCUDAReduction( const int size,
		}
		device_output_allocated = true;

		cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
		}
		dim3 block_size( 0 ), grid_size( 0 );
		@@ -247,40 +253,40 @@ bool tnlCUDAReduction( const int size,
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
		shmem = block_size. x * sizeof( T );
		cout << "Size: " << size_reduced
		/*cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		<< " Block size: " << block_size. x
		<< " Shmem: " << shmem << endl;
		<< " Shmem: " << shmem << endl;*/
		tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." << endl; );
		tnlAssert( block_size. x <= 512, cerr << "Block size is " << block_size. x << endl; );
		switch( block_size. x )
		{
		case 512:
		tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 256:
		tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 128:
		tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 64:
		tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 32:
		tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 16:
		tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 8:
		tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 4:
		tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 2:
		tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size, block_size, shmem >>>( size_reduced, 2 * grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 1:
		tnlAssert( false, cerr << "blockSize should not be 1." << endl );
		@@ -293,21 +299,24 @@ bool tnlCUDAReduction( const int size,
		reduction_input = device_output;

		// debuging part
		T* host_array = new T[ size ];
		/T host_array = new T[ size ];
		cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost );
		for( int i = 0; i< size; i ++ )
		cout << host_array[ i ] << " - ";
		cout << endl;
		cout << endl;*/

		T* output = new T[ size_reduced ];
		/T output = new T[ size_reduced ];
		cudaMemcpy( output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		cout << endl;
		for( int i = 0; i < size_reduced; i ++ )
		cout << output[ i ] << " ";
		cout << endl;
		delete[] output;
		delete[] output;*/
		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
		else
		cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		result = host_output[ 0 ];
		for( int i = 1; i < size_reduced; i++ )
		@@ -563,6 +572,9 @@ bool tnlCUDASimpleReduction5( const int size,
		delete[] output;*/
		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
		else
		cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		result = host_output[ 0 ];
		for( int i = 1; i < size_reduced; i++ )
		@@ -615,11 +627,13 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		sdata[ tid ] = d_input[ gid ];
		}
		__syncthreads();
		dbg_array1[ tid ] = sdata[ tid ];

		// Parallel reduction
		for( int s = blockDim. x / 2; s > 0; s >>= 1 )
		int n = last_tid < blockDim. x ? last_tid : blockDim. x;
		for( int s = n / 2; s > 0; s >>= 1 )
		{
		if( tid < s && tid + s < last_tid )
		if( tid < s && tid + s < n )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		@@ -628,7 +642,25 @@ __global__ void tnlCUDASimpleReductionKernel4( const int size,
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + s ];
		}
		/* This is for the case when we have odd number of elements.
		* The last one will be reduced using the thread with ID 0.
		*/
		if( 2 * s < n && tid == 0 )
		{
		if( operation == tnlMin )
		sdata[ 0 ] = tnlCudaMin( sdata[ 0 ], sdata[ n ] );
		if( operation == tnlMax )
		sdata[ 0 ] = tnlCudaMax( sdata[ 0 ], sdata[ n ] );
		if( operation == tnlSum )
		sdata[ 0 ] += sdata[ n ];
		dbg_array1[ n ] = -555; //sdata[ 0 ];

		}
		n = s;

		__syncthreads();
		//dbg_array1[ tid ] = -sdata[ tid ];

		}

		// Store the result back in global memory
		@@ -660,7 +692,7 @@ bool tnlCUDASimpleReduction4( const int size,
		}
		device_output_allocated = true;

		//cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
		}
		dim3 block_size( 0 ), grid_size( 0 );
		@@ -670,18 +702,20 @@ bool tnlCUDASimpleReduction4( const int size,
		while( size_reduced > cpuThreshold )
		{
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = ( size_reduced / block_size. x + 1 ) / 2;
		grid_size. x = size_reduced / block_size. x / 2;
		if( grid_size. x * 2 * block_size. x < size_reduced )
		grid_size. x ++;
		shmem = block_size. x * sizeof( T );
		/*cout << "Size: " << size_reduced
		cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		<< " Block size: " << block_size. x
		<< " Shmem: " << shmem << endl;*/
		<< " Shmem: " << shmem << endl;
		tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output, dbg_array1 );
		size_reduced = grid_size. x;
		reduction_input = device_output;

		// debuging part
		/T host_array = new T[ desBlockSize ];
		T* host_array = new T[ desBlockSize ];
		cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost );
		for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ )
		cout << host_array[ i ] << " - ";
		@@ -693,9 +727,12 @@ bool tnlCUDASimpleReduction4( const int size,
		for( int i = 0; i < size_reduced; i ++ )
		cout << output[ i ] << " ";
		cout << endl;
		delete[] output;*/
		delete[] output;
		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
		else
		cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		result = host_output[ 0 ];
		for( int i = 1; i < size_reduced; i++ )
		@@ -783,7 +820,7 @@ bool tnlCUDASimpleReduction3( const int size,
		while( size_reduced > cpuThreshold )
		{
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = size_reduced / block_size. x;
		grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 );
		shmem = block_size. x * sizeof( T );
		/*cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		@@ -803,6 +840,9 @@ bool tnlCUDASimpleReduction3( const int size,
		delete[] output;*/
		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
		else
		cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		result = host_output[ 0 ];
		for( int i = 1; i < size_reduced; i++ )
		@@ -891,7 +931,7 @@ bool tnlCUDASimpleReduction2( const int size,
		while( size_reduced > cpuThreshold )
		{
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = size_reduced / block_size. x;
		grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 );
		shmem = block_size. x * sizeof( T );
		/*cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		@@ -911,6 +951,9 @@ bool tnlCUDASimpleReduction2( const int size,
		delete[] output;*/
		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
		else
		cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		result = host_output[ 0 ];
		for( int i = 1; i < size_reduced; i++ )
		@@ -998,7 +1041,7 @@ bool tnlCUDASimpleReduction1( const int size,
		while( size_reduced > cpuThreshold )
		{
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = size_reduced / block_size. x;
		grid_size. x = size_reduced / block_size. x + ( size_reduced % block_size. x != 0 );
		shmem = block_size. x * sizeof( T );
		/*cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		@@ -1018,6 +1061,9 @@ bool tnlCUDASimpleReduction1( const int size,
		delete[] output;*/
		}
		T* host_output = new T[ size_reduced ];
		if( size == 1 )
		cudaMemcpy( host_output, device_input, sizeof( T ), cudaMemcpyDeviceToHost );
		else
		cudaMemcpy( host_output, device_output, size_reduced * sizeof( T ), cudaMemcpyDeviceToHost );
		result = host_output[ 0 ];
		for( int i = 1; i < size_reduced; i++ )

src/core/tnlCUDAKernelsTester.h

+127 −25

Original line number	Diff line number	Diff line
		@@ -19,6 +19,7 @@
		#define TNLCUDAKERNELSTESTER_H_

		#include <iostream>
		#include <math.h>
		#include <cppunit/TestSuite.h>
		#include <cppunit/TestResult.h>
		#include <cppunit/TestCaller.h>
		@@ -98,20 +99,21 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		return true;
		}

		void testReduction( int algorithm_efficiency = 0 )
		bool mainReduction( const tnlLongVector< T >& host_input,
		int algorithm_efficiency,
		const int desired_block_size,
		const int desired_grid_size )
		{
		int size = 32; 1024; //1<<10;
		int desBlockSize = 128; //Desired block size
		int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work

		tnlLongVector< T > host_input;
		const int size = host_input. GetSize();
		tnlLongVectorCUDA< T > device_input;
		CPPUNIT_ASSERT( testSetup( host_input,
		device_input,
		size ) );
		if( ! device_input. SetNewSize( size ) )
		return false;
		device_input. copyFrom( host_input );

		T seq_min( host_input[ 0 ] ),
		seq_max( host_input[ 0 ] ),
		seq_sum( host_input[ 0 ] );

		for( int i = 1; i < size; i ++ )
		{
		seq_min = :: Min( seq_min, host_input[ i ] );
		@@ -119,11 +121,6 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		seq_sum += host_input[ i ];
		}

		//Calculate necessary block/grid dimensions
		int block_size = :: Min( size/2, desBlockSize );
		//Grid size is limited in this case
		int grid_size = :: Min( desGridSize, size / block_size / 2 );

		T min, max, sum;
		switch( algorithm_efficiency )
		{
		@@ -159,13 +156,118 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		}


		cout << "Min: " << min << " Seq. min: " << seq_min << endl
		<< "Max: " << max << " Seq. max: " << seq_max << endl
		<< "Sum: " << sum << " Seq. sum: " << seq_sum << endl;
		if( min == seq_min )
		cout << "Min: " << min << " Seq. min: " << seq_min << " :-)" << endl;
		else
		cout << "Min: " << min << " Seq. min: " << seq_min << " !!!!!!!!!!" << endl;
		if( max == seq_max )
		cout << "Max: " << max << " Seq. max: " << seq_max << " :-)" << endl;
		else
		cout << "Max: " << max << " Seq. max: " << seq_max << " !!!!!!!!!!" << endl;
		if( sum == seq_sum )
		cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " :-)" << endl;
		else
		cout << "Sum: " << sum << " Seq. sum: " << seq_sum << " !!!!!!!!!!" << endl;

		T param;
		if( GetParameterType( param ) == "float" )
		{
		CPPUNIT_ASSERT( min == seq_min );
		CPPUNIT_ASSERT( max == seq_max );
		if( sum == 0.0 )
		{
		CPPUNIT_ASSERT( sum == seq_sum );
		}
		else
		{
		double diff = ( ( double ) sum - ( double ) seq_sum ) / ( double) sum;
		CPPUNIT_ASSERT( fabs( diff ) < 1.0e-5 );
		}
		}
		else
		{
		CPPUNIT_ASSERT( min == seq_min );
		CPPUNIT_ASSERT( max == seq_max );
		CPPUNIT_ASSERT( sum == seq_sum );
		}

		}

		void testReduction( int algorithm_efficiency = 0 )
		{
		tnlLongVector< T > host_input;
		int size = 2;
		/*for( int s = 1; s < 12; s ++ )
		{
		tnlLongVector< T > host_input( size );

		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 0.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 1.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		for( int i = 0; i < size; i ++ )
		host_input[ i ] = i;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		for( int i = 0; i < size; i ++ )
		host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );
		size *= 2;
		}*/
		for( size = 257; size < 5000; size ++ )
		{
		cout << "*********** Size is " << size << " ***************** " << endl;
		tnlLongVector< T > host_input( size );

		/*for( int i = 0; i < size; i ++ )
		host_input[ i ] = 0.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );*/

		for( int i = 0; i < size; i ++ )
		host_input[ i ] = 1.0;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		for( int i = 0; i < size; i ++ )
		host_input[ i ] = i;
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		for( int i = 0; i < size; i ++ )
		host_input[ i ] = ( i - size / 2 ) * ( i - size / 2 );
		mainReduction( host_input,
		algorithm_efficiency,
		256,
		2048 );

		}





		};

		@@ -196,13 +298,13 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		void testSimpleReduction2()
		{
		cout << "Test reduction 2" << endl;
		testReduction( 2 );
		//testReduction( 2 );
		};

		void testSimpleReduction1()
		{
		cout << "Test reduction 1" << endl;
		testReduction( 1 );
		//testReduction( 1 );
		};