Implementing CUDA parallel reduction. (3c8e9889) · Commits · TNL / tnl-dev

src/core/tnl-cuda-kernels.h

+19 −20

Original line number	Diff line number	Diff line
		@@ -60,29 +60,28 @@ __global__ void tnlCUDAReductionKernel( const int size,

		// Read data into the shared memory
		int tid = threadIdx. x;
		int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
		// Last thread ID which manipulates meaningful data
		int gid = 2 * blockSize * blockDim. x + threadIdx. x;

		//int grid_size = 2 * blockSize * gridDim. x;
		if( gid + blockSize < size )
		{
		if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
		}
		else
		else if( gid < size )
		{
		sdata[ tid ] = d_input[ gid ];
		}
		gid += grid_size;

		while( gid < size )
		while( gid + blockSize < size )
		{
		if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] ) );
		if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] ) );
		if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
		gid += grid_size;
		}
		dbg_array1[ blockIdx. x * blockDim. x + threadIdx. x ] = sdata[ tid ];
		__syncthreads();

		if( gid + blockDim. x < size )
		@@ -236,7 +235,7 @@ bool tnlCUDAReduction( const int size,
		}
		device_output_allocated = true;

		cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		cudaMalloc( ( void** ) &dbg_array1, size * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
		}
		dim3 block_size( 0 ), grid_size( 0 );
		@@ -257,31 +256,31 @@ bool tnlCUDAReduction( const int size,
		switch( block_size. x )
		{
		case 512:
		tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 256:
		tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 128:
		tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 64:
		tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 32:
		tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 16:
		tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 8:
		tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 4:
		tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 2:
		tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output, dbg_array1 );
		tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size, block_size, shmem >>>( size_reduced, grid_size. x * block_size. x, reduction_input, device_output, dbg_array1 );
		break;
		case 1:
		tnlAssert( false, cerr << "blockSize should not be 1." << endl );
		@@ -294,9 +293,9 @@ bool tnlCUDAReduction( const int size,
		reduction_input = device_output;

		// debuging part
		T* host_array = new T[ desBlockSize ];
		cudaMemcpy( host_array, dbg_array1, desBlockSize * sizeof( T ), cudaMemcpyDeviceToHost );
		for( int i = 0; i< :: Min( ( int ) block_size. x, desBlockSize ); i ++ )
		T* host_array = new T[ size ];
		cudaMemcpy( host_array, dbg_array1, size * sizeof( T ), cudaMemcpyDeviceToHost );
		for( int i = 0; i< size; i ++ )
		cout << host_array[ i ] << " - ";
		cout << endl;

src/core/tnlCUDAKernelsTester.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -100,7 +100,7 @@ template< class T > class tnlCUDAKernelsTester : public CppUnit :: TestCase

		void testReduction( int algorithm_efficiency = 0 )
		{
		int size = 1024; //1<<10;
		int size = 32; 1024; //1<<10;
		int desBlockSize = 128; //Desired block size
		int desGridSize = 2048; //Impose limitation on grid size so that threads could perform sequential work