Implementing CUDA parallel reduction. (a37a9195) · Commits · TNL / tnl-dev

configure.ac

+1 −1

Original line number	Diff line number	Diff line
		@@ -44,7 +44,7 @@ then
		STORE_CPPFLAGS=$CPPFLAGS
		STORE_LDFLAGS=$LDFLAGS
		CUDA_LDFLAGS="-L$CUDA_LIBS"
		CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA -arch sm_13"
		CUDA_CXXFLAGS="-I$CUDA_HEADERS -DHAVE_CUDA"
		CXXFLAGS="$CXXFLAGS -I$CUDA_HEADERS"
		CPPFLAGS="$CPPFLAGS -I$CUDA_HEADERS"
		LDFLAGS="$LDFLAGS $CUDA_LDFLAGS"

src/Makefile.am

+2 −0

Original line number	Diff line number	Diff line
		@@ -61,4 +61,6 @@ tnl_unit_tests_dbg_LDADD = libtnl-dbg-0.1.la \
		core/libcore-tests-dbg.la
		endif



		TESTS = tnl-unit-tests
		No newline at end of file

src/core/tnl-cuda-kernels.cu

+1 −27

Original line number	Diff line number	Diff line
		@@ -104,33 +104,7 @@ int tnlCUDASimpleReduction1Min( const int size,
		const int* input,
		int* output )
		{
		//Calculate necessary block/grid dimensions
		const int cpuThreshold = 1;
		const int desBlockSize = 128; //Desired block size
		dim3 blockSize = :: Min( size, desBlockSize );
		dim3 gridSize = size / blockSize. x;
		unsigned int shmem = blockSize. x * sizeof( int );
		cout << "Grid size: " << gridSize. x << endl
		<< "Block size: " << blockSize. x << endl
		<< "Shmem: " << shmem << endl;
		tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
		int sizeReduced = gridSize. x;
		while( sizeReduced > cpuThreshold )
		{
		cout << "Reducing with size reduced = " << sizeReduced << endl;
		blockSize. x = :: Min( sizeReduced, desBlockSize );
		gridSize. x = sizeReduced / blockSize. x;
		shmem = blockSize. x * sizeof(int);
		tnlCUDASimpleReductionKernel1< int, tnlMin ><<< gridSize, blockSize, shmem >>>( size, input, output );
		sizeReduced = gridSize. x;
		}
		int* host_output = new int[ sizeReduced ];
		cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
		int result = host_output[ 0 ];
		for( int i = 1;i < sizeReduced; i++ )
		result = :: Min( result, host_output[ i ] );
		delete[] host_output;
		return result;

		}

		int tnlCUDASimpleReduction1Max( const int size,

src/core/tnl-cuda-kernels.h

+41 −45

Original line number	Diff line number	Diff line
		@@ -273,52 +273,48 @@ __global__ void tnlCUDASimpleReductionKernel1( const int size,

		template< class T, tnlOperation operation >
		T tnlCUDASimpleReduction1( const int size,
		const int block_size,
		const int grid_size,
		const T* d_input )
		const T* d_input,
		T* output = 0 )
		{
		T result;

		dim3 blockSize( block_size );
		dim3 gridSize( grid_size );
		int shmem = 512 * sizeof( T );
		tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
		switch( block_size )
		//Calculate necessary block/grid dimensions
		const int cpuThreshold = 1;
		const int desBlockSize = 128; //Desired block size
		dim3 block_size = :: Min( size, desBlockSize );
		dim3 grid_size = size / blockSize. x;
		unsigned int shmem = blockSize. x * sizeof( int );
		cout << "Grid size: " << grid_size. x << endl
		<< "Block size: " << block_size. x << endl
		<< "Shmem: " << shmem << endl;
		if( ! output )
		{
		case 512:
		tnlCUDASimpleReductionKernel1< T, operation, 512 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 256:
		tnlCUDASimpleReductionKernel1< T, operation, 256 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 128:
		tnlCUDASimpleReductionKernel1< T, operation, 128 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 64:
		tnlCUDASimpleReductionKernel1< T, operation, 64 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 32:
		tnlCUDASimpleReductionKernel1< T, operation, 32 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 16:
		tnlCUDASimpleReductionKernel1< T, operation, 16 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 8:
		tnlCUDASimpleReductionKernel1< T, operation, 8 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 4:
		tnlCUDAReductionKernel< T, operation, 4 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 2:
		tnlCUDAReductionKernel< T, operation, 2 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		case 1:
		tnlCUDAReductionKernel< T, operation, 1 ><<< gridSize, blockSize, shmem >>>( size, d_input, &result );
		break;
		default:
		tnlAssert( false, cerr << "Block size is " << block_size << " which is none of 1, 2, 4, 8, 16, 32, 64, 128, 256 or 512." );
		break;
		cudaMalloc( ( void** ) &output, grid_size * sizeof( T ) );
		if( cuda // TODO: add allocation check
		}
		tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output );
		int size_reduced = grid_size. x;
		while( sizeReduced > cpuThreshold )
		{
		cout << "Reducing with size reduced = " << size_reduced << endl;
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = size_reduced / block_size. x;
		shmem = block_size. x * sizeof(int);
		tnlCUDASimpleReductionKernel1< T, operation ><<< grid_size, block_size, shmem >>>( size, input, output );
		size_reduced = grid_size. x;
		}
		int* host_output = new int[ size_reduced ];
		cudaMemcpy( host_output, output, sizeReduced * sizeof(int), cudaMemcpyDeviceToHost );
		int result = host_output[ 0 ];
		for( int i = 1;i < size_reduced; i++ )
		{
		if( operation == tnlMin)
		result = :: Min( result, host_output[ i ] );
		if( operation == tnlMax )
		result = :: Max( result, host_output[ i ] );
		if( operation == tnlSum )
		result += host_ouput[ i ];
		}
		delete[] host_output;

		return result;
		}

src/core/tnlLongVectorCUDATester.cu.h

+10 −8

Original line number	Diff line number	Diff line
		@@ -15,6 +15,15 @@ __global__ void setMultiBlockNumber( const T c, T* A, const int size )
		if( i < size ) A[ i ] = c;
		};

		template< class T >
		__global__ void setNumber( const T c, T* A, const int size )
		{
		int i = threadIdx. x;
		if( i < size )
		A[ i ] = c;
		};


		template< class T >
		void testMultiBlockKernel( const T& number, const int size )
		{
		@@ -37,20 +46,13 @@ void testMultiBlockKernel( const T& number, const int size )
		CPPUNIT_ASSERT( ! errors );
		};

		template< class T >
		__global__ void setNumber( T* A, const T c )
		{
		int i = threadIdx. x;
		A[ i ] = c;
		};

		template< class T >
		void testKernel( const T& number, const int size )
		{
		tnlLongVectorCUDA< T > device_vector( size );
		tnlLongVector< T > host_vector( size );
		T* data = device_vector. Data();
		setNumber<<< 1, size >>>( data, number );
		setNumber<<< 1, size >>>( number, data, size );
		host_vector. copyFrom( device_vector );

		int errors( 0 );