/**
 * Inclusive prefix sum (scan) over the lanes of a single warp.
 *
 * Lane k returns the sum of the `value` arguments supplied by lanes 0..k of
 * the same warp.
 *
 * Preconditions:
 *  - The shuffles use the full-warp mask 0xffffffff, so every non-exited lane
 *    of the warp must call this function together (no intra-warp divergence
 *    around the call).
 *  - The lane index is derived from threadIdx.x only.
 *
 * @param value  this lane's contribution
 * @return       sum of `value` over lanes 0..(this lane), inclusive
 */
__device__ int warpPrefixSum(int value)
{
    const int lane = threadIdx.x & (warpSize - 1);

    // Log-step scan: offsets 1, 2, 4, ... warpSize/2.
    for (int offset = 1; offset < warpSize; offset *= 2)
    {
        // The shuffle is executed unconditionally so all lanes take part in
        // the intrinsic; only the accumulation is predicated. Lanes below
        // `offset` have no source lane and must not add the shuffled value.
        const int fromBelow = __shfl_up_sync(0xffffffff, value, offset);
        if (lane >= offset)
            value += fromBelow;
    }

    return value;
}

/**
 * Inclusive prefix sum (scan) over the threads of a block, ordered by
 * threadIdx.x.
 *
 * Thread t returns the sum of the `value` arguments supplied by threads 0..t.
 *
 * Preconditions:
 *  - Must be called by every thread of the block: it contains
 *    __syncthreads() barriers.
 *  - Only threadIdx.x / blockDim.x are used, and the scratch array holds one
 *    slot per warp, so blockDim.x may span at most 32 warps.
 *
 * blockDim.x does not have to be a multiple of warpSize: a trailing partial
 * warp is offset by the total of all preceding full warps.
 *
 * @param value  this thread's contribution
 * @return       sum of `value` over threads 0..threadIdx.x, inclusive
 */
__device__ int blockPrefixSum(int value)
{
    static __shared__ int shared[32];
    const int lane = threadIdx.x & (warpSize - 1);
    const int wid = threadIdx.x / warpSize;
    const int fullWarps = blockDim.x / warpSize;

    // Step 1: scan inside each warp.
    int result = warpPrefixSum(value);

    // Step 2: the last lane of every full warp publishes that warp's total.
    // A trailing partial warp has no such lane; its total is never needed
    // because no warp comes after it.
    if (lane == warpSize - 1)
        shared[wid] = result;
    __syncthreads();

    // Step 3: warp 0 turns the per-warp totals into exclusive offsets.
    // Slots at or beyond `fullWarps` were never written, so they are treated
    // as 0 and the subtraction uses the register copy instead of re-reading
    // shared memory. Slot `fullWarps` thereby receives the sum of all full
    // warps, which is exactly the offset a trailing partial warp needs.
    if (wid == 0)
    {
        const int warpTotal = (lane < fullWarps) ? shared[lane] : 0;
        shared[lane] = warpPrefixSum(warpTotal) - warpTotal;
    }
    __syncthreads();

    // Step 4: shift this warp's local scan by the sum of all earlier warps.
    return result + shared[wid];
}