Optimized upper bound for the scan of warpResults in the CUDA parallel scan (addb7566) · Commits · TNL / tnl-dev

src/TNL/Algorithms/detail/CudaScanKernel.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -116,7 +116,7 @@ CudaScanKernelFirstPhase( const InputView input,
		// Perform the scan of warpResults using one warp.
		if( warpIdx == 0 )
		#pragma unroll
		-for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
		+for( int stride = 1; stride < blockSize / Cuda::getWarpSize(); stride *= 2 ) {
		if( threadInWarpIdx >= stride )
		warpResults[ threadIdx.x ] = reduction( warpResults[ threadIdx.x ], warpResults[ threadIdx.x - stride ] );
		__syncwarp();