Loading quicksort/quicksort.cuh +2 −2 Original line number Diff line number Diff line Loading @@ -49,8 +49,8 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); int smallerOffset = blockPrefixSum(smaller); int biggerOffset = blockPrefixSum(bigger); int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); if (threadIdx.x == blockDim.x - 1) { Loading quicksort/reduction.cuh +4 −4 Original line number Diff line number Diff line Loading @@ -36,7 +36,7 @@ __device__ int blockReduceSum(int val) return shared[0]; } __device__ int warpPrefixSum(int value) __device__ int warpInclusivePrefixSum(int value) { int laneId = threadIdx.x & 0x1f; for (int i = 1; i*2 <= warpSize; i *= 2) Loading @@ -49,13 +49,13 @@ __device__ int warpPrefixSum(int value) return value; } __device__ int blockPrefixSum(int value) __device__ int blockInclusivePrefixSum(int value) { static __shared__ int shared[32]; int lane = threadIdx.x & (warpSize - 1); int wid = threadIdx.x / warpSize; int tmp = warpPrefixSum(value); int tmp = warpInclusivePrefixSum(value); if (lane == warpSize-1) shared[wid] = tmp; Loading @@ -63,7 +63,7 @@ __device__ int blockPrefixSum(int value) int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; if (wid == 0) shared[lane] = warpPrefixSum(tmp2) - shared[lane]; shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2; __syncthreads(); tmp += shared[wid]; Loading Loading
quicksort/quicksort.cuh +2 −2 Original line number Diff line number Diff line Loading @@ -49,8 +49,8 @@ __global__ void cudaPartition(CudaArrayView arr, int begin, int end, int smaller = 0, bigger = 0; cmpElem(arr, myBegin, myEnd, pivot, smaller, bigger); int smallerOffset = blockPrefixSum(smaller); int biggerOffset = blockPrefixSum(bigger); int smallerOffset = blockInclusivePrefixSum(smaller); int biggerOffset = blockInclusivePrefixSum(bigger); if (threadIdx.x == blockDim.x - 1) { Loading
quicksort/reduction.cuh +4 −4 Original line number Diff line number Diff line Loading @@ -36,7 +36,7 @@ __device__ int blockReduceSum(int val) return shared[0]; } __device__ int warpPrefixSum(int value) __device__ int warpInclusivePrefixSum(int value) { int laneId = threadIdx.x & 0x1f; for (int i = 1; i*2 <= warpSize; i *= 2) Loading @@ -49,13 +49,13 @@ __device__ int warpPrefixSum(int value) return value; } __device__ int blockPrefixSum(int value) __device__ int blockInclusivePrefixSum(int value) { static __shared__ int shared[32]; int lane = threadIdx.x & (warpSize - 1); int wid = threadIdx.x / warpSize; int tmp = warpPrefixSum(value); int tmp = warpInclusivePrefixSum(value); if (lane == warpSize-1) shared[wid] = tmp; Loading @@ -63,7 +63,7 @@ __device__ int blockPrefixSum(int value) int tmp2 = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0; if (wid == 0) shared[lane] = warpPrefixSum(tmp2) - shared[lane]; shared[lane] = warpInclusivePrefixSum(tmp2) - tmp2; __syncthreads(); tmp += shared[wid]; Loading