Optimizing parallel reduction in CUDA. (398d4b1a) · Commits · TNL / tnl-dev

src/core/cuda/CMakeLists.txt

+2 −0

Original line number	Diff line number	Diff line
		@@ -3,6 +3,8 @@ set( headers cuda-prefix-sum.h
		cuda-reduction.h
		cuda-reduction_impl.h
		reduction-operations.h
		tnlCudaReduction.h
		tnlCudaReduction_impl.h
		tnlCublasWrapper.h )

		SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/core/cuda )

src/core/cuda/cuda-prefix-sum_impl.h

+3 −2

Original line number	Diff line number	Diff line
		@@ -163,13 +163,14 @@ __global__ void cudaSecondPhaseBlockPrefixSum( const Operation operation,
		{
		if( blockIdx. x > 0 )
		{
		const DataType shift = operation.commonReductionOnDevice( gridShift, auxArray[ blockIdx. x - 1 ] );
		DataType shift( gridShift );
		operation.commonReductionOnDevice( shift, auxArray[ blockIdx. x - 1 ] );

		const Index readOffset = blockIdx. x * elementsInBlock;
		Index readIdx = threadIdx. x;
		while( readIdx < elementsInBlock && readOffset + readIdx < size )
		{
		operation.performInPlace( data[ readIdx + readOffset ], shift );
		operation.commonReductionOnDevice( data[ readIdx + readOffset ], shift );
		readIdx += blockDim. x;
		}
		}

src/core/cuda/cuda-reduction_impl.h

+19 −181

Original line number	Diff line number	Diff line
		@@ -29,6 +29,7 @@
		#include <core/arrays/tnlArrayOperations.h>
		#include <core/mfuncs.h>
		#include <core/cuda/tnlCudaReductionBuffer.h>
		#include <core/cuda/tnlCudaReduction.h>

		#ifdef CUDA_REDUCTION_PROFILING
		#include <core/tnlTimerRT.h>
		@@ -42,184 +43,23 @@ using namespace std;
		* are reduced on CPU. The constant must not be larger
		* than maximal CUDA grid size.
		*/
		const int minGPUReductionDataSize = 256; //2048;//65536; //16384;//1024;//256;
		const int minGPUReductionDataSize = 256;//65536; //16384;//1024;//256;

		static tnlCudaReductionBuffer cudaReductionBuffer( 8 * minGPUReductionDataSize );

		#ifdef HAVE_CUDA

		/***
		* The parallel reduction of one vector.
		*
		* WARNING: This kernel only reduce data in one block. Use rather tnlCUDASimpleReduction2
		* to call this kernel then doing it by yourself.
		* This kernel is very inefficient. It is here only for educative and testing reasons.
		* Please use tnlCUDAReduction instead.
		*
		* The kernel parameters:
		* @param size is the number of all element to reduce - not just in one block.
		* @param deviceInput input data which we want to reduce
		* @param deviceOutput an array to which we write the result of reduction.
		* Each block of the grid writes one element in this array
		* (i.e. the size of this array equals the number of CUDA blocks).
		*/

		template< typename Operation, int blockSize, bool isSizePow2 >
		__global__ void tnlCUDAReductionKernel( const Operation operation,
		const typename Operation :: IndexType size,
		const typename Operation :: RealType* deviceInput,
		const typename Operation :: RealType* deviceInput2,
		typename Operation :: ResultType* deviceOutput )
		{
		extern __shared__ __align__ ( 8 ) char __sdata[];

		typedef typename Operation :: IndexType IndexType;
		typedef typename Operation :: RealType RealType;
		typedef typename Operation :: ResultType ResultType;

		ResultType* sdata = reinterpret_cast< ResultType* >( __sdata );

		/***
		* Get thread id (tid) and global thread id (gid).
		* gridSize is the number of element processed by all blocks at the
		* same time.
		*/
		IndexType tid = threadIdx. x;
		IndexType gid = blockIdx. x * blockDim. x + threadIdx. x;
		IndexType gridSize = blockDim. x * gridDim.x;

		sdata[ tid ] = operation.initialValue();
		/***
		* Read data into the shared memory. We start with the
		* sequential reduction.
		*/
		while( gid + 4 * gridSize < size )
		{
		operation.firstReduction( sdata[ tid ], gid, deviceInput, deviceInput2 );
		operation.firstReduction( sdata[ tid ], gid + gridSize, deviceInput, deviceInput2 );
		operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, deviceInput, deviceInput2 );
		operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, deviceInput, deviceInput2 );
		//sdata[ tid ] += deviceInput[ gid ] * deviceInput[ gid ];
		//sdata[ tid ] += deviceInput[ gid + gridSize ] * deviceInput[ gid + gridSize ];
		//sdata[ tid ] += deviceInput[ gid + 2 * gridSize ] * deviceInput[ gid + 2 * gridSize ];
		//sdata[ tid ] += deviceInput[ gid + 3 * gridSize ] * deviceInput[ gid + 3 * gridSize ];
		gid += 4*gridSize;
		}
		while( gid + 2 * gridSize < size )
		{
		operation.firstReduction( sdata[ tid ], gid, deviceInput, deviceInput2 );
		operation.firstReduction( sdata[ tid ], gid + gridSize, deviceInput, deviceInput2 );

		//sdata[ tid ] += deviceInput[ gid ] * deviceInput[ gid ];
		//sdata[ tid ] += deviceInput[ gid + gridSize ] * deviceInput[ gid + gridSize ];
		gid += 2*gridSize;
		}
		while( gid < size )
		{
		operation.firstReduction( sdata[ tid ], gid, deviceInput, deviceInput2 );
		//sdata[ tid ] += deviceInput[ gid ] * deviceInput[ gid ];
		gid += gridSize;
		}
		__syncthreads();


		//printf( "1: tid %d data %f \n", tid, sdata[ tid ] );

		//return;
		/***
		* Perform the parallel reduction.
		*/
		if( blockSize >= 1024 )
		{
		if( tid < 512 )
		//sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 512 ] );
		sdata[ tid ] += sdata[ tid + 512 ];
		__syncthreads();
		}
		if( blockSize >= 512 )
		{
		if( tid < 256 )
		//sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 256 ] );
		sdata[ tid ] += sdata[ tid + 256 ];
		__syncthreads();
		}
		if( blockSize >= 256 )
		{
		if( tid < 128 )
		//sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 128 ] );
		sdata[ tid ] += sdata[ tid + 128 ];
		__syncthreads();
		//printf( "2: tid %d data %f \n", tid, sdata[ tid ] );
		}

		if( blockSize >= 128 )
		{
		if( tid < 64 )
		//sdata[ tid ] = operation.commonReductionOnDevice( sdata[ tid ], sdata[ tid + 64 ] );
		sdata[ tid ] += sdata[ tid + 64 ];
		__syncthreads();
		//printf( "3: tid %d data %f \n", tid, sdata[ tid ] );
		}


		/***
		* This runs in one warp so it is synchronized implicitly.
		*/
		if( tid < 32 )
		{
		volatile ResultType* vsdata = sdata;
		if( blockSize >= 64 )
		{
		//vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 32 ] );
		vsdata[ tid ] += vsdata[ tid + 32 ];
		//__syncthreads();
		//printf( "4: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 32 )
		{
		//vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 16 ] );
		vsdata[ tid ] += vsdata[ tid + 16 ];
		//__syncthreads();
		//printf( "5: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 16 )
		{
		//vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 8 ] );
		vsdata[ tid ] += vsdata[ tid + 8 ];
		//__syncthreads();
		//printf( "6: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 8 )
		{
		//vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 4 ] );
		vsdata[ tid ] += vsdata[ tid + 4 ];
		//__syncthreads();
		//printf( "7: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 4 )
		{
		//vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 2 ] );
		vsdata[ tid ] += vsdata[ tid + 2 ];
		//__syncthreads();
		//printf( "8: tid %d data %f \n", tid, sdata[ tid ] );
		}
		if( blockSize >= 2 )
		{
		//vsdata[ tid ] = operation.commonReductionOnDevice( vsdata[ tid ], vsdata[ tid + 1 ] );
		vsdata[ tid ] += vsdata[ tid + 1 ];
		//__syncthreads();
		//printf( "9: tid %d data %f \n", tid, sdata[ tid ] );
		}
		}

		/***
		* Store the result back in the global memory.
		*/
		if( tid == 0 )
		const typename Operation :: RealType* input1,
		const typename Operation :: RealType* input2,
		typename Operation :: ResultType* output )
		{
		//printf( "Block %d result = %f \n", blockIdx.x, sdata[ 0 ] );
		deviceOutput[ blockIdx. x ] = sdata[ 0 ];
		}
		}
		typedef tnlCUDAReduction< Operation, blockSize, isSizePow2 > Reduction;
		Reduction::reduce( operation, size, input1, input2, output );
		};

		template< typename Operation >
		typename Operation::IndexType reduceOnCudaDevice( const Operation& operation,
		@@ -430,8 +270,6 @@ bool reductionOnCudaDevice( const Operation& operation,
		timer.reset();
		timer.start();
		#endif
		//for( IndexType i = 0; i < reducedSize; i ++ )
		// cout << resultArray[ i ] << ", ";
		result = laterReductionOperation. initialValueOnHost( 0, resultArray, ( ResultType* ) 0 );
		for( IndexType i = 1; i < reducedSize; i ++ )
		result = laterReductionOperation. reduceOnHost( i, result, resultArray, ( ResultType*) 0 );

src/core/cuda/reduction-operations.h

+182 −173

File changed.

Preview size limit exceeded, changes collapsed.

src/core/cuda/tnlCudaReduction.h

0 → 100644

+62 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		tnlCudaReduction.h - description
		-------------------
		begin : Jun 17, 2015
		copyright : (C) 2015 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/***************************************************************************
		* *
		* This program is free software; you can redistribute it and/or modify *
		* it under the terms of the GNU General Public License as published by *
		* the Free Software Foundation; either version 2 of the License, or *
		* (at your option) any later version. *
		* *
		***************************************************************************/

		#ifndef TNLCUDAREDUCTION_H
		#define TNLCUDAREDUCTION_H

		#ifdef HAVE_CUDA

		template< typename Operation, int blockSize, bool isSizePow2 >
		class tnlCUDAReduction
		{
		public:

		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		typedef typename Operation::ResultType ResultType;


		__device__ static void reduce( const Operation operation,
		const IndexType size,
		const RealType* input1,
		const RealType* input2,
		ResultType* output );
		};

		/*template< typename Real, typename Index, int blockSize, bool isSizePow2 >
		class tnlCUDAReduction< tnlParallelReductionScalarProduct< Real, Index >, blockSize, isSizePow2 >
		{
		public:

		typedef tnlParallelReductionScalarProduct< Real, Index > Operation;
		typedef typename Operation::IndexType IndexType;
		typedef typename Operation::RealType RealType;
		typedef typename Operation::ResultType ResultType;

		__device__ static void reduce( const Operation operation,
		const IndexType size,
		const RealType* input1,
		const RealType* input2,
		ResultType* output );
		};*/

		#include <core/cuda/tnlCudaReduction_impl.h>

		#endif

		#endif /* TNLCUDAREDUCTION_H */