From 45bf0dae19110fcbc455143a040cd81b983c308c Mon Sep 17 00:00:00 2001 From: Xuan Thang Nguyen <nguyexu2@fit.cvut.cz> Date: Mon, 10 May 2021 20:26:21 +0200 Subject: [PATCH] roll back 1st step --- GPUSort/src/bitonicSort/bitonicSort.h | 48 +++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/GPUSort/src/bitonicSort/bitonicSort.h b/GPUSort/src/bitonicSort/bitonicSort.h index 72a1c78..0ca7e5e 100644 --- a/GPUSort/src/bitonicSort/bitonicSort.h +++ b/GPUSort/src/bitonicSort/bitonicSort.h @@ -108,15 +108,49 @@ template <typename Value, typename CMP> __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, CMP Cmp) { extern __shared__ int externMem[]; - int sharedMemLen = 2 * blockDim.x; + + Value * sharedMem = (Value *)externMem; + int sharedMemLen = 2*blockDim.x; + int myBlockStart = blockIdx.x * sharedMemLen; - int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen); + int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+sharedMemLen); - if (blockIdx.x % 2 || blockIdx.x + 1 == gridDim.x) - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, Cmp); - else - bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, - [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); + //copy from globalMem into sharedMem + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + sharedMem[i] = arr[myBlockStart + i]; + __syncthreads(); + + //------------------------------------------ + //bitonic activity + { + int i = blockIdx.x * blockDim.x + threadIdx.x; + int paddedSize = closestPow2(myBlockEnd - myBlockStart); + + for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2) + { + //calculate the direction of swapping + int monotonicSeqIdx = i / (monotonicSeqLen/2); + bool ascending = (monotonicSeqIdx & 1) != 0; + if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for parts with no "partner" + ascending = true; + + for (int len = monotonicSeqLen; len > 1; len /= 2) + { + //calculates which 2 indexes will be compared and swap + int part = threadIdx.x / (len / 2); + int s = part * len + (threadIdx.x & ((len / 2) - 1)); + int e = s + len / 2; + + if(e < myBlockEnd - myBlockStart) //touching virtual padding + cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp); + __syncthreads(); + } + } + } + + //writeback to global memory + for (int i = threadIdx.x; myBlockStart + i < myBlockEnd; i += blockDim.x) + arr[myBlockStart + i] = sharedMem[i]; } //--------------------------------------------- -- GitLab