diff --git a/quicksort/reduction.cuh b/quicksort/reduction.cuh index 9c528c10bffd8eb925d3c1fbfb55331c43bd6714..234871c93f0392fd73fb9e4ecc80912273573209 100644 --- a/quicksort/reduction.cuh +++ b/quicksort/reduction.cuh @@ -36,37 +36,28 @@ __device__ int blockReduceSum(int val) return shared[0]; } + +template<int it> __device__ int warpInclusivePrefixSum(int value) { - int laneId = threadIdx.x & 0x1f; - for (int i = 1; i*2 <= warpSize; i *= 2) + if(it*2 <= 32) { + int i = it; int n = __shfl_up_sync(0xffffffff, value, i); + int laneId = threadIdx.x & 0x1f; if ((laneId & (warpSize - 1)) >= i) value += n; + return warpInclusivePrefixSum<it*2 >= 32? 32 : it*2>(value); + } return value; } -/* -template<int it = 32> __device__ int warpInclusivePrefixSum(int value) { - if(it >= 2) - { - int i = it == 0? 32 : 32/it; - int n = __shfl_up_sync(0xffffffff, value, i); - int laneId = threadIdx.x & 0x1f; - if ((laneId & (warpSize - 1)) >= i) - value += n; - return warpInclusivePrefixSum<it/2>(value); - - } - - return value; + return warpInclusivePrefixSum<1>(value); } -*/ __device__ int blockInclusivePrefixSum(int value) {