From 9328c524be76a185c08b94b3a396bfbdef5d17b0 Mon Sep 17 00:00:00 2001
From: Xuan Thang Nguyen <nguyexu2@fit.cvut.cz>
Date: Thu, 4 Mar 2021 01:03:22 +0100
Subject: [PATCH] parallel block wide prefix sum

---
 quicksort/reduction.cuh | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/quicksort/reduction.cuh b/quicksort/reduction.cuh
index 3efb309..48d3ed3 100644
--- a/quicksort/reduction.cuh
+++ b/quicksort/reduction.cuh
@@ -34,4 +34,38 @@ __device__ int blockReduceSum(int val)
     __syncthreads(); 
 
     return shared[0];
+}
+
+// Inclusive prefix sum across the calling warp; all 32 lanes must take part.
+__device__ int warpPrefixSum(int value)
+{
+    const int laneIdx = threadIdx.x & 0x1f; // this thread's lane, 0..31
+    for (int offset = 1; 2 * offset <= warpSize; offset <<= 1)
+    {
+        // Running sum held by the lane `offset` positions below this one.
+        const int below = __shfl_up_sync(0xffffffff, value, offset);
+        value += (laneIdx >= offset) ? below : 0; // low lanes have no source
+    }
+    return value;
+}
+
+// Inclusive prefix sum over the block in threadIdx.x order; call from every thread.
+// Needs blockDim.x to be a multiple of warpSize and at most 32 warps per block.
+__device__ int blockPrefixSum(int value)
+{
+    static __shared__ int shared[32]; // one slot per warp
+    int lane = threadIdx.x & (warpSize - 1);
+    int wid = threadIdx.x / warpSize;
+
+    int tmp = warpPrefixSum(value); // inclusive scan inside each warp
+    if (lane == warpSize-1)
+        shared[wid] = tmp; // the last lane holds its warp's total
+    __syncthreads();
+    // Warp 0 turns the totals into exclusive offsets. Subtract the register copy
+    // so lanes past the warp count never read a shared slot nobody has written.
+    int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0;
+    if (wid == 0)
+        shared[lane] = warpPrefixSum(tmp2) - tmp2;
+    __syncthreads();
+    return tmp + shared[wid];
 }
\ No newline at end of file
-- 
GitLab