From c60aa9e42a4afdb71e317c753dfac73ddc19858c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Oberhuber?= <oberhuber.tomas@gmail.com> Date: Thu, 11 Apr 2019 21:53:36 +0200 Subject: [PATCH] Rewritting parallel reduciton with lambda functions. --- .../Algorithms/CudaReductionKernel.h | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index a5823849d4..08a767c601 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -39,14 +39,19 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif -template< int blockSize, typename Operation, typename Index > +template< int blockSize, + typename Real, + typename FirstPhase, + typename SecondPhase, + typename Index, + typename ResultType = decltype( std::declval< FirstPhase >( 0,0 ) ) > __global__ void __launch_bounds__( Reduction_maxThreadsPerBlock, Reduction_minBlocksPerMultiprocessor ) -CudaReductionKernel( Operation operation, +CudaReductionKernel( const Real& initialValue, + FirstReduction& firstReduction, + SecondReduction& secondReduction, const Index size, - const typename Operation::DataType1* input1, - const typename Operation::DataType2* input2, - typename Operation::ResultType* output ) + ResultType* output ) { typedef Index IndexType; typedef typename Operation::ResultType ResultType; @@ -62,23 +67,23 @@ CudaReductionKernel( Operation operation, IndexType gid = blockIdx.x * blockDim. x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; - sdata[ tid ] = operation.initialValue(); + sdata[ tid ] = initialValue; /*** * Read data into the shared memory. We start with the * sequential reduction. */ while( gid + 4 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 2 * gridSize, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + 3 * gridSize, input1, input2 ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + gridSize ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + 2 * gridSize ); + sdata[ tid ] = firstReduction( sdata[ tid ], gid + 3 * gridSize ); gid += 4 * gridSize; } while( gid + 2 * gridSize < size ) { - operation.firstReduction( sdata[ tid ], gid, input1, input2 ); - operation.firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); + firstReduction( sdata[ tid ], gid, input1, input2 ); + firstReduction( sdata[ tid ], gid + gridSize, input1, input2 ); gid += 2 * gridSize; } while( gid < size ) -- GitLab