Skip to content
Snippets Groups Projects
Commit 9328c524 authored by Xuan Thang Nguyen's avatar Xuan Thang Nguyen
Browse files

parallel block wide prefix sum

parent 7d6e4313
No related branches found
No related tags found
No related merge requests found
......@@ -34,4 +34,38 @@ __device__ int blockReduceSum(int val)
__syncthreads();
return shared[0];
}
// Inclusive prefix sum (scan) of `value` across the lanes of one warp.
//
// After the call, each lane holds the sum of the `value` arguments passed by
// itself and by every lower-numbered lane of the same warp. The lane index is
// derived from threadIdx.x only.
//
// The shuffles use the full mask 0xffffffff, so all 32 lanes of the warp must
// execute this call together.
__device__ int warpPrefixSum(int value)
{
    const int lane = threadIdx.x & 0x1f & (warpSize - 1);

    // Doubling-distance scan: at each step a lane pulls the partial sum from
    // the lane `offset` positions below it.
    int offset = 1;
    while (2 * offset <= warpSize)
    {
        const int below = __shfl_up_sync(0xffffffff, value, offset);
        // Lanes closer than `offset` to the start of the warp have no source
        // lane below them; they keep their current partial sum.
        value += (lane >= offset) ? below : 0;
        offset <<= 1;
    }
    return value;
}
// Inclusive prefix sum (scan) of `value` across the threads of a block,
// ordered by threadIdx.x.
//
// Steps:
//   1. every warp scans its own values with warpPrefixSum;
//   2. the last lane of each warp publishes the warp total to shared memory;
//   3. warp 0 turns the warp totals into per-warp offsets (exclusive scan);
//   4. every thread adds the offset of its warp to its warp-local result.
//
// Preconditions (not checked here):
//   - threads are indexed by threadIdx.x only;
//   - blockDim.x is a multiple of warpSize: only lane warpSize-1 publishes a
//     warp total, and warpPrefixSum shuffles with a full-warp mask;
//   - the block has at most 32 warps (size of the scratch array);
//   - every thread of the block calls this function, since it contains
//     __syncthreads().
__device__ int blockPrefixSum(int value)
{
    static __shared__ int shared[32];
    const int lane = threadIdx.x & (warpSize - 1);
    const int wid = threadIdx.x / warpSize;
    const int numWarps = blockDim.x / warpSize;

    // Step 1: warp-local inclusive scan.
    int tmp = warpPrefixSum(value);

    // Step 2: the last lane holds the sum of the whole warp.
    if (lane == warpSize - 1)
        shared[wid] = tmp;
    __syncthreads();

    // Step 3: warp 0 scans the warp totals. All 32 lanes of warp 0 take part
    // in the shuffle; lanes without a matching warp contribute 0 and never
    // read or write their (unwritten) scratch slot.
    if (wid == 0)
    {
        const int warpTotal = (lane < numWarps) ? shared[lane] : 0;
        // Inclusive scan minus own total = sum of all preceding warps.
        const int warpOffset = warpPrefixSum(warpTotal) - warpTotal;
        if (lane < numWarps)
            shared[lane] = warpOffset;
    }
    __syncthreads();

    // Step 4: shift the warp-local result by the total of all earlier warps.
    return tmp + shared[wid];
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment