Implementing CUDA parallel reduction. (0076d54f) · Commits · TNL / tnl-dev

src/core/Makefile.am

+1 −0

Original line number	Diff line number	Diff line
		@@ -38,6 +38,7 @@ headers = tnlAssert.h \
		tnlTester.h \
		tnlVector.h \
		tnl-cuda-kernels.h \
		tnl-cuda-kernels.cu.h \
		tnlCUDAKernelsTester.h \
		compress-file.h \
		mfilename.h \

src/core/tnl-cuda-kernels.cu

+45 −47

Original line number	Diff line number	Diff line
		@@ -22,76 +22,74 @@
		using namespace std;

		int tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const int* input )
		const int* input,
		int& result,
		int* device_aux_array = 0 )
		{
		return tnlCUDAReduction< int, tnlMin >( size, block_size, grid_size, input );
		return tnlCUDAReduction< int, tnlMin >( size, input, result, device_aux_array );
		}

		int tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const int* input )
		const int* input,
		int& result,
		int* device_aux_array = 0 )
		{
		return tnlCUDAReduction< int, tnlMax >( size, block_size, grid_size, input );
		return tnlCUDAReduction< int, tnlMax >( size, input, result, device_aux_array );
		}

		int tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const int* input )
		const int* input,
		int& result,
		int* device_aux_array = 0 )
		{
		return tnlCUDAReduction< int, tnlSum >( size, block_size, grid_size, input );
		return tnlCUDAReduction< int, tnlSum >( size, input, result, device_aux_array );
		}


		float tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		bool tnlCUDAReductionMin( const int size,
		const float* input,
		float& result,
		float* device_aux_array = 0 )
		{
		return tnlCUDAReduction< float, tnlMin >( size, block_size, grid_size, input );
		return tnlCUDAReduction< float, tnlMin >( size, input, result, device_aux_array );
		}

		float tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		bool tnlCUDAReductionMax( const int size,
		const float* input,
		float& result,
		float* device_aux_array = 0 )
		{
		return tnlCUDAReduction< float, tnlMax >( size, block_size, grid_size, input );
		return tnlCUDAReduction< float, tnlMax >( size, input, result, device_aux_array );
		}

		float tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const float* input )
		bool tnlCUDAReductionSum( const int size,
		const float* input,
		float& result,
		float* device_aux_array = 0 )
		{
		return tnlCUDAReduction< float, tnlSum >( size, block_size, grid_size, input );
		return tnlCUDAReduction< float, tnlSum >( size, input, result, device_aux_array );
		}

		double tnlCUDAReductionMin( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		bool tnlCUDAReductionMin( const int size,
		const double* input,
		double& result,
		double* device_aux_array = 0 )
		{
		return tnlCUDAReduction< double, tnlMin >( size, block_size, grid_size, input );
		return tnlCUDAReduction< double, tnlMin >( size, input, result, device_aux_array );
		}

		double tnlCUDAReductionMax( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		bool tnlCUDAReductionMax( const int size,
		const double* input,
		double& result,
		double* device_aux_array = 0 )
		{
		return tnlCUDAReduction< double, tnlMax >( size, block_size, grid_size, input );
		return tnlCUDAReduction< double, tnlMax >( size, input, result, device_aux_array );
		}

		double tnlCUDAReductionSum( const int size,
		const int block_size,
		const int grid_size,
		const double* input )
		bool tnlCUDAReductionSum( const int size,
		const double* input,
		double& result,
		double* device_aux_array = 0 )
		{
		return tnlCUDAReduction< double, tnlSum >( size, block_size, grid_size, input );
		return tnlCUDAReduction< double, tnlSum >( size, input, result, device_aux_array );
		}

		/*

src/core/tnl-cuda-kernels.cu.h

0 → 100644

+222 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnl-cuda-kernels.cu.h - description
		-------------------
		begin : Jan 19, 2010
		copyright : (C) 2010 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#ifndef TNLCUDAKERNELS_CU_H_
		#define TNLCUDAKERNELS_CU_H_

		int tnlCUDAReductionMin( const int size,
		const int* input,
		int& result,
		int* device_aux_array = 0 );

		int tnlCUDAReductionMax( const int size,
		const int* input,
		int& result,
		int* device_aux_array = 0 );

		int tnlCUDAReductionSum( const int size,
		const int* input,
		int& result,
		int* device_aux_array = 0 );

		bool tnlCUDAReductionMin( const int size,
		const float* input,
		float& result,
		float* device_aux_array = 0 );

		bool tnlCUDAReductionMax( const int size,
		const float* input,
		float& result,
		float* device_aux_array = 0 );

		bool tnlCUDAReductionSum( const int size,
		const float* input,
		float& result,
		float* device_aux_array = 0 );

		bool tnlCUDAReductionMin( const int size,
		const double* input,
		double& result,
		double* device_aux_array = 0 );

		bool tnlCUDAReductionMax( const int size,
		const double* input,
		double& result,
		double* device_aux_array = 0 );

		bool tnlCUDAReductionSum( const int size,
		const double* input,
		double& result,
		double* device_aux_array = 0 );

		/*
		* Simple reduction 5
		*/
		bool tnlCUDASimpleReduction5Min( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction5Max( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction5Sum( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction5Min( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction5Max( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction5Sum( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction5Min( const int size,
		const double* input,
		double& result);
		bool tnlCUDASimpleReduction5Max( const int size,
		const double* input,
		double& result );
		bool tnlCUDASimpleReduction5Sum( const int size,
		const double* input,
		double& result );

		/*
		* Simple reduction 4
		*/
		bool tnlCUDASimpleReduction4Min( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction4Max( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction4Sum( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction4Min( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction4Max( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction4Sum( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction4Min( const int size,
		const double* input,
		double& result);
		bool tnlCUDASimpleReduction4Max( const int size,
		const double* input,
		double& result );
		bool tnlCUDASimpleReduction4Sum( const int size,
		const double* input,
		double& result );

		/*
		* Simple reduction 3
		*/
		bool tnlCUDASimpleReduction3Min( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction3Max( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction3Sum( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction3Min( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction3Max( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction3Sum( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction3Min( const int size,
		const double* input,
		double& result);
		bool tnlCUDASimpleReduction3Max( const int size,
		const double* input,
		double& result );
		bool tnlCUDASimpleReduction3Sum( const int size,
		const double* input,
		double& result );

		/*
		* Simple reduction 2
		*/
		bool tnlCUDASimpleReduction2Min( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction2Max( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction2Sum( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction2Min( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction2Max( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction2Sum( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction2Min( const int size,
		const double* input,
		double& result);
		bool tnlCUDASimpleReduction2Max( const int size,
		const double* input,
		double& result );
		bool tnlCUDASimpleReduction2Sum( const int size,
		const double* input,
		double& result );

		/*
		* Simple reduction 1
		*/
		bool tnlCUDASimpleReduction1Min( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction1Max( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction1Sum( const int size,
		const int* input,
		int& result );
		bool tnlCUDASimpleReduction1Min( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction1Max( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction1Sum( const int size,
		const float* input,
		float& result);
		bool tnlCUDASimpleReduction1Min( const int size,
		const double* input,
		double& result);
		bool tnlCUDASimpleReduction1Max( const int size,
		const double* input,
		double& result );
		bool tnlCUDASimpleReduction1Sum( const int size,
		const double* input,
		double& result );


		#endif /* TNLCUDAKERNELS_CU_H_ */

src/core/tnl-cuda-kernels.h

+164 −169

Original line number	Diff line number	Diff line
		@@ -56,18 +56,18 @@ __global__ void tnlCUDAReductionKernel( const int size,
		T* d_output,
		T* dbg_array1 = 0 )
		{
		extern __shared__ T sdata[];
		extern __shared__ __align__ ( 8 ) T sdata[];

		// Read data into the shared memory
		int tid = threadIdx. x;
		int gid = 2 * blockIdx. x * blockDim. x + threadIdx. x;
		// Last thread ID which manipulates meaningful data

		int grid_size = 2 * blockSize * gridDim. x;
		//int grid_size = 2 * blockSize * gridDim. x;
		if( gid + blockSize < size )
		{
		if( operation == tnlMin ) sdata[ tid ] = :: Min( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlMax ) sdata[ tid ] = :: Max( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( d_input[ gid ], d_input[ gid + blockSize ] );
		if( operation == tnlSum ) sdata[ tid ] = d_input[ gid ] + d_input[ gid + blockSize ];
		}
		else
		@@ -78,10 +78,10 @@ __global__ void tnlCUDAReductionKernel( const int size,

		while( gid < size )
		{
		if( operation == tnlMin ) sdata[ tid ] = :: Min( sdata[ tid ], :: Min( d_input[gid], d_input[gid+blockSize] );
		if( operation == tnlMax ) sdata[ tid ] = :: Max( sdata[ tid ], :: Max( d_input[gid], d_input[gid+blockSize] );
		if( operation == tnlMin ) sdata[ tid ] = :: tnlCudaMin( sdata[ tid ], :: tnlCudaMin( d_input[gid], d_input[gid+blockSize] ) );
		if( operation == tnlMax ) sdata[ tid ] = :: tnlCudaMax( sdata[ tid ], :: tnlCudaMax( d_input[gid], d_input[gid+blockSize] ) );
		if( operation == tnlSum ) sdata[ tid ] += d_input[gid] + d_input[ gid + blockSize ];
		gid += gridSize;
		gid += grid_size;
		}
		__syncthreads();

		@@ -220,8 +220,7 @@ bool tnlCUDAReduction( const int size,
		//Calculate necessary block/grid dimensions
		const int cpuThreshold = 1;
		const int desBlockSize = 16; //Desired block size

		T* dbg_array1;
		const int desGridSize = 2048;

		bool device_output_allocated( false );
		if( ! device_output )
		@@ -234,9 +233,6 @@ bool tnlCUDAReduction( const int size,
		return false;
		}
		device_output_allocated = true;

		//cudaMalloc( ( void** ) &dbg_array1, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!
		//cudaMalloc( ( void** ) &dbg_array2, desBlockSize * sizeof( T ) ); //!!!!!!!!!!!!!!!!!!!!!!!!!
		}
		dim3 block_size( 0 ), grid_size( 0 );
		int shmem;
		@@ -245,42 +241,41 @@ bool tnlCUDAReduction( const int size,
		while( size_reduced > cpuThreshold )
		{
		block_size. x = :: Min( size_reduced, desBlockSize );
		grid_size. x = :: Min( ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
		grid_size. x = :: Min( ( int ) ( size_reduced / block_size. x + 1 ) / 2, desGridSize );
		shmem = block_size. x * sizeof( T );
		cout << "Size: " << size_reduced
		<< " Grid size: " << grid_size. x
		<< " Block size: " << block_size. x
		<< " Shmem: " << shmem << endl;
		//tnlCUDASimpleReductionKernel4< T, operation ><<< grid_size. x, block_size, shmem >>>( size_reduced, reduction_input, device_output );
		tnlAssert( shmem < 16384, cerr << shmem << " bytes are required." );
		switch( block_size. x )
		{
		case 512:
		tnlCUDASimpleReductionKernel5< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 512 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 256:
		tnlCUDASimpleReductionKernel5< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 256 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 128:
		tnlCUDASimpleReductionKernel5< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 128 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 64:
		tnlCUDASimpleReductionKernel5< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 64 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 32:
		tnlCUDASimpleReductionKernel5< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 32 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 16:
		tnlCUDASimpleReductionKernel5< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 16 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 8:
		tnlCUDASimpleReductionKernel5< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 8 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 4:
		tnlCUDASimpleReductionKernel5< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 4 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 2:
		tnlCUDASimpleReductionKernel5< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		tnlCUDAReductionKernel< T, operation, 2 ><<< grid_size. x, block_size, shmem >>>( size_reduced, grid_size. x, reduction_input, device_output );
		break;
		case 1:
		tnlAssert( false, cerr << "blockSize should not be 1." << endl );

src/core/tnlCUDAKernelsTester.h

+15 −200

File changed.

Preview size limit exceeded, changes collapsed.