Almost working simple parallel reduction in CUDA. (40cb098f) · Commits · TNL / tnl-dev

src/core/tnl-cuda-kernels.cu

+161 −1

Original line number	Diff line number	Diff line
		@@ -15,7 +15,11 @@
		* *
		***************************************************************************/

		#include <tnl-cuda-kernels.h>
		#include <iostream>
		#include <core/mfuncs.h>
		#include <core/tnl-cuda-kernels.h>

		using namespace std;

		int tnlCUDAReductionMin( const int size,
		const int block_size,
		@@ -90,4 +94,160 @@ double tnlCUDAReductionSum( const int size,
		return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
		}

		/*
		* Simple redcution 1
		*/

		int tnlCUDASimpleReduction1Min( const int size,
		const int block_size,
		const int grid_size,
		const int* input,
		int* output )
		{
		//Calculate necessary block/grid dimensions
		const int cpuThreshold = 1;
		const int desBlockSize = 128; //Desired block size
		dim3 blockSize = :: Min( size, desBlockSize );
		dim3 gridSize = size / blockSize. x;
		unsigned int shmem = blockSize. x * sizeof( int );
		cout << "Grid size: " << gridSize. x << endl
		<< "Block size: " << blockSize. x << endl
		<< "Shmem: " << shmem << endl;
		tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
		int sizeReduced = gridSize. x;
		while( sizeReduced > cpuThreshold )
		{
		cout << "Reducing with size reduced = " << sizeReduced << endl;
		blockSize. x = :: Min( sizeReduced, desBlockSize );
		gridSize. x = sizeReduced / blockSize. x;
		shmem = blockSize. x * sizeof(int);
		tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
		sizeReduced = gridSize. x;
		}
		int* host_output = new int[ sizeReduced ];
		cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
		int result = host_output[ 0 ];
		for( int i = 1;i < sizeReduced; i++ )
		result = :: Min( result, host_output[ i ] );
		delete[] host_output;
		return result;
		}

		int tnlCUDASimpleReduction1Max( const int size,
		const int block_size,
		const int grid_size,
		const int* input,
		int* output )
		{
		//Calculate necessary block/grid dimensions
		const int cpuThreshold = 1;
		const int desBlockSize = 128; //Desired block size
		dim3 blockSize = :: Min( size, desBlockSize );
		dim3 gridSize = size / blockSize. x;
		unsigned int shmem = blockSize. x * sizeof( int );
		cout << "Grid size: " << gridSize. x << endl
		<< "Block size: " << blockSize. x << endl
		<< "Shmem: " << shmem << endl;
		tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output );
		int sizeReduced = gridSize. x;
		while( sizeReduced > cpuThreshold )
		{
		cout << "Reducing with size reduced = " << sizeReduced << endl;
		blockSize. x = :: Min( sizeReduced, desBlockSize );
		gridSize. x = sizeReduced / blockSize. x;
		shmem = blockSize. x * sizeof(int);
		tnlCUDASimpleReductionKernel1< int, tnlMax ><<< gridSize, blockSize, shmem >>>( size, input, output );
		sizeReduced = gridSize. x;
		}
		int* host_output = new int[ sizeReduced ];
		cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
		int result = host_output[ 0 ];
		for( int i = 1;i < sizeReduced; i++ )
		result = :: Max( result, host_output[ i ] );
		delete[] host_output;
		return result;
		}

		int tnlCUDASimpleReduction1Sum( const int size,
		const int block_size,
		const int grid_size,
		const int* input,
		int* output )
		{
		//Calculate necessary block/grid dimensions
		const int cpuThreshold = 1;
		const int desBlockSize = 128; //Desired block size
		dim3 blockSize = :: Min( size, desBlockSize );
		dim3 gridSize = size / blockSize. x;
		unsigned int shmem = blockSize. x * sizeof( int );
		cout << "Grid size: " << gridSize. x << endl
		<< "Block size: " << blockSize. x << endl
		<< "Shmem: " << shmem << endl;
		tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output );
		int sizeReduced = gridSize. x;
		while( sizeReduced > cpuThreshold )
		{
		cout << "Reducing with size reduced = " << sizeReduced << endl;
		blockSize. x = :: Min( sizeReduced, desBlockSize );
		gridSize. x = sizeReduced / blockSize. x;
		shmem = blockSize. x * sizeof(int);
		tnlCUDASimpleReductionKernel1< int, tnlSum ><<< gridSize, blockSize, shmem >>>( size, input, output );
		sizeReduced = gridSize. x;
		}
		int* host_output = new int[ sizeReduced ];
		cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
		int result = host_output[ 0 ];
		for( int i = 1;i < sizeReduced; i++ )
		result += host_output[ i ];
		delete[] host_output;
		return result;
		}

		/*
		float tnlCUDASimpleReduction1Min( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		{
		return tnlCUDASimpleReduction1< float, tnlMin >( size, block_size, grid_size, input );
		}

		float tnlCUDASimpleReduction1Max( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		{
		return tnlCUDASimpleReduction1< float, tnlMax >( size, block_size, grid_size, input );
		}

		float tnlCUDASimpleReduction1Sum( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		{
		return tnlCUDASimpleReduction1< float, tnlSum >( size, block_size, grid_size, input );
		}

		double tnlCUDASimpleReduction1Min( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		{
		return tnlCUDASimpleReduction1< double, tnlMin >( size, block_size, grid_size, input );
		}

		double tnlCUDASimpleReduction1Max( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		{
		return tnlCUDASimpleReduction1< double, tnlMax >( size, block_size, grid_size, input );
		}

		double tnlCUDASimpleReduction1Sum( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		{
		return tnlCUDASimpleReduction1< double, tnlSum >( size, block_size, grid_size, input );
		}*/
		No newline at end of file

src/core/tnl-cuda-kernels.h

+87 −0

Original line number	Diff line number	Diff line
		@@ -235,6 +235,93 @@ T tnlCUDAReduction( const int size,
		return result;
		}

		template < class T, tnlOperation operation >
		__global__ void tnlCUDASimpleReductionKernel1( const int size,
		const T* d_input,
		T* d_output )
		{
		extern __shared__ int sdata[];

		// Read data into the shared memory
		int tid = threadIdx. x;
		int gid = blockIdx. x * blockDim. x + threadIdx. x;
		// Last thread ID which manipulates meaningful data
		int last_tid = size - blockIdx. x * blockDim. x;
		if( gid < size )
		sdata[tid] = d_input[gid];
		__syncthreads();

		// Parallel reduction
		for( int s = 1; s < blockDim. x; s *= 2 )
		{
		if( ( tid % ( 2 * s ) ) == 0 && tid + s < last_tid )
		{
		if( operation == tnlMin )
		sdata[ tid ] = tnlCudaMin( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlMax )
		sdata[ tid ] = tnlCudaMax( sdata[ tid ], sdata[ tid + s ] );
		if( operation == tnlSum )
		sdata[ tid ] += sdata[ tid + s ];
		}
		__syncthreads();
		}

		// Store the result back in global memory
		if( tid == 0 )
		d_output[ blockIdx. x ] = sdata[ 0 ];
		}

		template< class T, tnlOperation operation >
		T tnlCUDASimpleReduction1( const int size,
		const int block_size,
		const int grid_size,
		const T* d_input )
		{
		T result;

		dim3 blockSize( block_size );
		dim3 gridSize( grid_size );
		int shmem = 512 * sizeof( T );
		tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
		switch( block_size )
		{
		case 512:
		tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 256:
		tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 128:
		tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 64:
		tnlCUDASimpleReductionKernel1< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 32:
		tnlCUDASimpleReductionKernel1< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 16:
		tnlCUDASimpleReductionKernel1< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 8:
		tnlCUDASimpleReductionKernel1< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 4:
		tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 2:
		tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 1:
		tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		default:
		tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		break;
		}
		return result;
		}

		#endif /* HAVE_CUDA */

		#endif /* TNLCUDAKERNELS_H_ */

src/core/tnlCUDAKernelsTester.h

+107 −20

Original line number	Diff line number	Diff line
		@@ -67,6 +67,49 @@ double tnlCUDAReductionSum( const int size,
		const int grid_size,
		const double* input );

		/*
		* Simple reduction 1
		*/
		int tnlCUDASimpleReduction1Min( const int size,
		const int block_size,
		const int grid_size,
		const int* input,
		int* output );
		int tnlCUDASimpleReduction1Max( const int size,
		const int block_size,
		const int grid_size,
		const int* input,
		int* output );
		int tnlCUDASimpleReduction1Sum( const int size,
		const int block_size,
		const int grid_size,
		const int* input,
		int* output );
		float tnlCUDASimpleReduction1Min( const int size,
		const int block_size,
		const int grid_size,
		const float* input );
		float tnlCUDASimpleReduction1Max( const int size,
		const int block_size,
		const int grid_size,
		const float* input );
		float tnlCUDASimpleReduction1Sum( const int size,
		const int block_size,
		const int grid_size,
		const float* input );
		double tnlCUDASimpleReduction1Min( const int size,
		const int block_size,
		const int grid_size,
		const double* input );
		double tnlCUDASimpleReduction1Max( const int size,
		const int block_size,
		const int grid_size,
		const double* input );
		double tnlCUDASimpleReduction1Sum( const int size,
		const int block_size,
		const int grid_size,
		const double* input );

		#endif


		@@ -82,50 +125,94 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase
		{
		CppUnit :: TestSuite* suiteOfTests = new CppUnit :: TestSuite( "tnlCUDAKernelsTester" );
		CppUnit :: TestResult result;
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
		"testSimpleReduction1",
		& tnlCUDAKernelsTester< T > :: testSimpleReduction1 )
		);
		suiteOfTests -> addTest( new CppUnit :: TestCaller< tnlCUDAKernelsTester< T > >(
		"testReduction",
		& tnlCUDAKernelsTester< T > :: testReduction )
		& tnlCUDAKernelsTester< T > :: testFastReduction )
		);

		return suiteOfTests;
		};

		void testReduction()
		bool testSetup( tnlLongVector< T >& host_input,
		tnlLongVector< T >& host_output,
		tnlLongVectorCUDA< T >& device_input,
		tnlLongVectorCUDA< T >& device_output,
		int size )
		{
		/*
		* Test by Jan Vacata.
		*/
		int size = 100;
		int desBlockSize = 128; //Desired block size
		int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work

		tnlLongVector< T > host_input( size );
		tnlLongVector< T > host_output( size );

		tnlLongVectorCUDA< T > device_input( size );
		tnlLongVectorCUDA< T > device_output( size );
		if( ! host_input. SetNewSize( size ) )
		return false;
		if( ! host_output. SetNewSize( size ) )
		return false;
		if( ! device_input. SetNewSize( size ) )
		return false;
		if( ! device_output. SetNewSize( size ) )
		return false;

		for( int i=0; i < size; i ++ )
		{
		host_input[ i ] = 1;
		host_input[ i ] = i + 1;
		host_output[ i ] = 0;
		}
		device_input. copyFrom( host_input );
		return true;
		}

		void testReduction( int algorithm_efficiency = 0 )
		{
		int size = 1<<16;
		int desBlockSize = 128; //Desired block size
		int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work

		tnlLongVector< T > host_input, host_output;
		tnlLongVectorCUDA< T > device_input, device_output;
		CPPUNIT_ASSERT( testSetup( host_input,
		host_output,
		device_input,
		device_output,
		size ) );



		//Calculate necessary block/grid dimensions
		int block_size = :: Min( size/2, desBlockSize );
		//Grid size is limited in this case
		int grid_size = :: Min( desGridSize, size / block_size / 2 );

		T min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
		T max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
		T sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
		T min, max, sum;
		switch( algorithm_efficiency )
		{
		case 1:
		min = tnlCUDASimpleReduction1Min( size, block_size, grid_size, device_input. Data(), device_output. Data() );
		max = tnlCUDASimpleReduction1Max( size, block_size, grid_size, device_input. Data(), device_output. Data() );
		sum = tnlCUDASimpleReduction1Sum( size, block_size, grid_size, device_input. Data(), device_output. Data() );
		break;
		default:
		min = tnlCUDAReductionMin( size, block_size, grid_size, device_input. Data() );
		max = tnlCUDAReductionMax( size, block_size, grid_size, device_input. Data() );
		sum = tnlCUDAReductionSum( size, block_size, grid_size, device_input. Data() );
		}

		cout << "Min: " << min
		<< "Max: " << max
		cout << "Min: " << min << endl
		<< "Max: " << max << endl
		<< "Sum: " << sum << endl;

		};

		void testSimpleReduction1()
		{
		testReduction( 1 );
		};

		void testFastReduction()
		{
		testReduction( 0 );
		}


		};

src/tnl-unit-tests.cpp

+2 −2

Original line number	Diff line number	Diff line
		@@ -44,8 +44,8 @@ int main( int argc, char* argv[] )
		runner.addTest( tnlGridCUDA2DTester< double > :: suite() );

		runner.addTest( tnlCUDAKernelsTester< int > :: suite() );
		runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
		runner.addTest( tnlCUDAKernelsTester< double > :: suite() );
		//runner.addTest( tnlCUDAKernelsTester< float > :: suite() );
		//runner.addTest( tnlCUDAKernelsTester< double > :: suite() );

		runner.run();
		return 0;