This project is archived; the project and its data are read-only.

Commit 87a42926 authored Jun 17, 2015 by Tomáš Oberhuber

Optimizing parallel reduction in CUDA.

parent 3aa06ead

src/core/CMakeLists.txt

+4 −3

Original line number	Diff line number	Diff line
		@@ -4,6 +4,7 @@ ADD_SUBDIRECTORY( cuda )
		ADD_SUBDIRECTORY( vectors )

		set (headers tnlAssert.h
		tnlConstants.h
		tnlCurve.h
		tnlCuda.h
		tnlDataElement.h

src/core/cuda/cuda-prefix-sum_impl.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -53,7 +53,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		if( prefixSumType == exclusivePrefixSum )
		{
		if( idx == 0 )
		sharedData[ 0 ] = operation.identity();
		sharedData[ 0 ] = operation.initialValue();
		while( idx < elementsInBlock && blockOffset + idx < size )
		{
		sharedData[ tnlCuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
		@@ -129,7 +129,7 @@ __global__ void cudaFirstPhaseBlockPrefixSum( const enumPrefixSumType prefixSumT
		while( idx < elementsInBlock && blockOffset + idx < size )
		{
		const Index chunkIdx = idx / chunkSize;
		DataType chunkShift( operation.identity() );
		DataType chunkShift( operation.initialValue() );
		if( chunkIdx > 0 )
		chunkShift = auxData[ chunkIdx - 1 ];
		operation.performInPlace( sharedData[ tnlCuda::getInterleaving( idx ) ], chunkShift );

src/core/cuda/cuda-reduction_impl.h

+261 −254

File changed.

Preview size limit exceeded, changes collapsed.

src/core/cuda/reduction-operations.h

+448 −4

File changed.

Preview size limit exceeded, changes collapsed.

src/core/mfuncs.h

+11 −0

Original line number	Diff line number	Diff line
		@@ -96,6 +96,17 @@ inline int roundToMultiple( int number, int multiple )
		return multiple*( number/ multiple + ( number % multiple != 0 ) );
		}

		/****
		 * Returns true if and only if x is a positive power of two
		 * (1, 2, 4, 8, ...). Zero and negative values yield false.
		 *
		 * A power of two has exactly one bit set, so clearing the lowest set
		 * bit with x & ( x - 1 ) must give zero.
		 */
		__cuda_callable__
		inline bool isPow2( int x )
		{
		   // The parentheses around the bitwise AND are required: `==` binds
		   // tighter than `&`, so `x & ( x - 1 ) == 0` would be evaluated as
		   // `x & ( ( x - 1 ) == 0 )`, which is true only for x == 1.
		   // The x > 0 test is evaluated first, so x - 1 cannot overflow.
		   return x > 0 && ( x & ( x - 1 ) ) == 0;
		}

		/****
		 * Returns true if and only if x is a positive power of two
		 * (1, 2, 4, 8, ...). Zero and negative values yield false.
		 *
		 * A power of two has exactly one bit set, so clearing the lowest set
		 * bit with x & ( x - 1 ) must give zero.
		 */
		__cuda_callable__
		inline bool isPow2( long int x )
		{
		   // The parentheses around the bitwise AND are required: `==` binds
		   // tighter than `&`, so `x & ( x - 1 ) == 0` would be evaluated as
		   // `x & ( ( x - 1 ) == 0 )`, which is true only for x == 1.
		   // The x > 0 test is evaluated first, so x - 1 cannot overflow.
		   return x > 0 && ( x & ( x - 1 ) ) == 0;
		}

		/*template< typename T >
		void swap( T& a, T& b)