From 9328c524be76a185c08b94b3a396bfbdef5d17b0 Mon Sep 17 00:00:00 2001
From: Xuan Thang Nguyen <nguyexu2@fit.cvut.cz>
Date: Thu, 4 Mar 2021 01:03:22 +0100
Subject: [PATCH] parallel block wide prefix sum

---
 quicksort/reduction.cuh | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/quicksort/reduction.cuh b/quicksort/reduction.cuh
index 3efb309..48d3ed3 100644
--- a/quicksort/reduction.cuh
+++ b/quicksort/reduction.cuh
@@ -34,4 +34,73 @@ __device__ int blockReduceSum(int val)
     __syncthreads();
 
     return shared[0];
+}
+
+/**
+ * Inclusive prefix sum (scan) over the lanes of one warp.
+ *
+ * All lanes of the warp have to call this together, because the shuffle is
+ * issued with the full 0xffffffff participation mask. The lane index is
+ * derived from threadIdx.x only.
+ *
+ * @param value this lane's input element
+ * @return sum of value over lanes 0..laneId of the calling warp
+ */
+__device__ int warpPrefixSum(int value)
+{
+    int laneId = threadIdx.x & (warpSize - 1);
+
+    // Each round adds the partial sum held `offset` lanes below; the reach
+    // doubles every round (1, 2, 4, ...) until it spans the whole warp.
+    for (int offset = 1; offset < warpSize; offset *= 2)
+    {
+        int below = __shfl_up_sync(0xffffffff, value, offset);
+        // Lanes below `offset` have no source lane and must keep their value.
+        if (laneId >= offset)
+            value += below;
+    }
+
+    return value;
+}
+
+/**
+ * Inclusive prefix sum over all threads of a block, ordered by threadIdx.x.
+ *
+ * Has to be called by every thread of the block, since it contains
+ * __syncthreads(). Only threadIdx.x and blockDim.x are consulted, and there
+ * is one shared slot per warp (32 slots).
+ *
+ * @param value this thread's input element
+ * @return sum of value over threads 0..threadIdx.x of the block
+ */
+__device__ int blockPrefixSum(int value)
+{
+    static __shared__ int shared[32];
+    int lane = threadIdx.x & (warpSize - 1);
+    int wid = threadIdx.x / warpSize;
+
+    // Scan inside each warp first.
+    int tmp = warpPrefixSum(value);
+
+    // The last lane of a warp holds that warp's total; publish it.
+    if (lane == warpSize - 1)
+        shared[wid] = tmp;
+    __syncthreads();
+
+    // Only the slots of complete warps were written above, so every other
+    // lane contributes 0 instead of reading its slot.
+    int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+
+    // Warp 0 turns the warp totals into exclusive offsets (inclusive scan
+    // minus own total). Subtract the register copy tmp2 rather than
+    // re-reading shared[lane]: for lanes without a complete warp that slot
+    // was never written, and the stale value would end up in the offset of a
+    // trailing partial warp.
+    if (wid == 0)
+        shared[lane] = warpPrefixSum(tmp2) - tmp2;
+    __syncthreads();
+
+    // Shift this warp's local scan by the total of all preceding warps.
+    tmp += shared[wid];
+    return tmp;
 }
\ No newline at end of file
-- 
GitLab