This project is archived. Its data is read-only.

Commit 639df716 authored Nov 09, 2016 by Jakub Klinkovský

Fixed bug in parallel reduction on CUDA

parent ca8e0b12

src/TNL/Containers/Algorithms/CudaMultireductionKernel.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -39,7 +39,7 @@ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a powe
		template< typename Operation, int blockSizeX >
		__global__ void
		__launch_bounds__( Multireduction_maxThreadsPerBlock, Multireduction_minBlocksPerMultiprocessor )
		-CudaMultireductionKernel( Operation& operation,
		+CudaMultireductionKernel( Operation operation,
		const typename Operation::IndexType n,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -39,7 +39,7 @@ static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of
		template< typename Operation, int blockSize >
		__global__ void
		__launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor )
		-CudaReductionKernel( Operation& operation,
		+CudaReductionKernel( Operation operation,
		const typename Operation::IndexType size,
		const typename Operation::RealType* input1,
		const typename Operation::RealType* input2,

src/TNL/Containers/Algorithms/Reduction_impl.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -98,7 +98,7 @@ reductionOnCudaDevice( Operation& operation,
		/***
		* Transfer the reduced data from device to host.
		*/
		-ResultType resultArray[ minGPUReductionDataSize ];
		+ResultType resultArray[ reducedSize ];
		if( ! Containers::ArrayOperations< Devices::Host, Devices::Cuda >::copyMemory< ResultType, ResultType, IndexType >( resultArray, deviceAux1, reducedSize ) )
		return false;