From a8d88571df8452188365f5ef6bf31a7f6570081e Mon Sep 17 00:00:00 2001
From: Xuan Thang Nguyen <nguyexu2@fit.cvut.cz>
Date: Wed, 7 Apr 2021 23:10:02 +0200
Subject: [PATCH] file formatting and switch Cmp position in function call

---
 src/quicksort/cudaPartition.cuh    | 106 ++++++++++++++---------
 src/quicksort/quicksort.cuh        |  37 +++++-----
 src/quicksort/quicksort_1Block.cuh |  87 ++++++++++++-----------
 3 files changed, 111 insertions(+), 119 deletions(-)

diff --git a/src/quicksort/cudaPartition.cuh b/src/quicksort/cudaPartition.cuh
index fcca2c3..ebc2813 100644
--- a/src/quicksort/cudaPartition.cuh
+++ b/src/quicksort/cudaPartition.cuh
@@ -8,93 +8,92 @@ using namespace TNL;
 using namespace TNL::Containers;
 
 template <typename Value, typename Device, typename Function>
-__device__ Value pickPivot(TNL::Containers::ArrayView<Value, Device> src, const Function & Cmp)
+__device__ Value pickPivot(TNL::Containers::ArrayView<Value, Device> src, const Function &Cmp)
 {
     //return src[0];
     //return src[src.getSize()-1];
 
-    if(src.getSize() ==1)
+    if (src.getSize() == 1)
         return src[0];
-    
-    Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1];
-    if(Cmp(a, b)) // ..a..b..
+    Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1];
+
+    if (Cmp(a, b)) // ..a..b..
     {
-        if(Cmp(b, c))// ..a..b..c
+        if (Cmp(b, c)) // ..a..b..c
            return b;
-        else if(Cmp(c, a))//..c..a..b..
+        else if (Cmp(c, a)) //..c..a..b..
            return a;
        else //..a..c..b..
            return c;
    }
    else //..b..a..
    {
-        if(Cmp(a, c))//..b..a..c
+        if (Cmp(a, c)) //..b..a..c
            return a;
-        else if(Cmp(c, b))//..c..b..a..
+        else if (Cmp(c, b)) //..c..b..a..
            return b;
        else //..b..c..a..
            return c;
    }
-    
 }
 
 template <typename Value, typename Device, typename Function>
-__device__ Value pickPivotIdx(TNL::Containers::ArrayView<Value, Device> src, const Function & Cmp)
+__device__ Value pickPivotIdx(TNL::Containers::ArrayView<Value, Device> src, const Function &Cmp)
 {
     //return 0;
     //return src.getSize()-1;
 
-    if(src.getSize() <= 1)
+    if (src.getSize() <= 1)
        return 0;
-    
-    Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1];
-    if(Cmp(a, b)) // ..a..b..
+    Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1];
+
+    if (Cmp(a, b)) // ..a..b..
    {
-        if(Cmp(b, c))// ..a..b..c
-            return src.getSize()/2;
-        else if(Cmp(c, a))//..c..a..b..
+        if (Cmp(b, c)) // ..a..b..c
+            return src.getSize() / 2;
+        else if (Cmp(c, a)) //..c..a..b..
            return 0;
        else //..a..c..b..
            return src.getSize() - 1;
    }
    else //..b..a..
    {
-        if(Cmp(a, c))//..b..a..c
+        if (Cmp(a, c)) //..b..a..c
            return 0;
-        else if(Cmp(c, b))//..c..b..a..
-            return src.getSize()/2;
+        else if (Cmp(c, b)) //..c..b..a..
+            return src.getSize() / 2;
        else //..b..c..a..
            return src.getSize() - 1;
    }
 }
 
 template <typename Value, typename Function>
-__device__
-void countElem(ArrayView<Value, Devices::Cuda> arr, const Function & Cmp,
-               int &smaller, int &bigger,
-               const Value &pivot)
+__device__ void countElem(ArrayView<Value, Devices::Cuda> arr,
+                          const Function &Cmp,
+                          int &smaller, int &bigger,
+                          const Value &pivot)
 {
     for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x)
     {
         const Value data = arr[i];
-        if(Cmp(data, pivot))
+        if (Cmp(data, pivot))
             smaller++;
-        else if(Cmp(pivot, data) )
+        else if (Cmp(pivot, data))
             bigger++;
     }
 }
 
 template <typename Value, typename Function>
-__device__
-void copyDataShared(ArrayView<Value, Devices::Cuda> src,
-                    ArrayView<Value, Devices::Cuda> dst, const Function & Cmp,
-                    Value *sharedMem,
-                    int smallerStart, int biggerStart,
-                    int smallerTotal, int biggerTotal,
-                    int smallerOffset, int biggerOffset, //exclusive prefix sum of elements
-                    const Value &pivot)
+__device__ void copyDataShared(ArrayView<Value, Devices::Cuda> src,
+                               ArrayView<Value, Devices::Cuda> dst,
+                               const Function &Cmp,
+                               Value *sharedMem,
+                               int smallerStart, int biggerStart,
+                               int smallerTotal, int biggerTotal,
+                               int smallerOffset, int biggerOffset, //exclusive prefix sum of elements
+                               const Value &pivot)
 {
 
     for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
@@ -109,7 +108,7 @@ void copyDataShared(ArrayView<Value, Devices::Cuda> src,
 
     for (int i = threadIdx.x; i < smallerTotal + biggerTotal; i += blockDim.x)
     {
-        if(i < smallerTotal)
+        if (i < smallerTotal)
             dst[smallerStart + i] = sharedMem[i];
         else
             dst[biggerStart + i - smallerTotal] = sharedMem[i];
@@ -117,17 +116,16 @@ void copyDataShared(ArrayView<Value, Devices::Cuda> src,
 }
 
 template <typename Value, typename Function>
-__device__
-void copyData(ArrayView<Value, Devices::Cuda> src,
-              ArrayView<Value, Devices::Cuda> dst,
-              const Function & Cmp,
-              int smallerStart, int biggerStart,
-              const Value &pivot)
+__device__ void copyData(ArrayView<Value, Devices::Cuda> src,
+                         ArrayView<Value, Devices::Cuda> dst,
+                         const Function &Cmp,
+                         int smallerStart, int biggerStart,
+                         const Value &pivot)
 {
     for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
     {
         const Value data = src[i];
-        if ( Cmp(data, pivot) )
+        if (Cmp(data, pivot))
         {
             /*
             if(smallerStart >= dst.getSize() || smallerStart < 0)
@@ -135,7 +133,7 @@ void copyData(ArrayView<Value, Devices::Cuda> src,
             */
             dst[smallerStart++] = data;
         }
-        else if ( Cmp(pivot, data) )
+        else if (Cmp(pivot, data))
         {
             /*
             if(biggerStart >= dst.getSize() || biggerStart < 0)
@@ -151,10 +149,10 @@
 template <typename Value, typename Function, bool useShared>
 __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
                               ArrayView<Value, Devices::Cuda> dst,
-                              Value * sharedMem,
-                              const Function &Cmp, const Value & pivot,
-                              int elemPerBlock, TASK & task
-                              )
+                              const Function &Cmp,
+                              Value *sharedMem,
+                              const Value &pivot,
+                              int elemPerBlock, TASK &task)
 {
     static __shared__ int smallerStart, biggerStart;
 
@@ -167,7 +165,7 @@ __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
 
     int smaller = 0, bigger = 0;
     countElem(srcView, Cmp, smaller, bigger, pivot);
-    
+
     int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
     int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
 
@@ -179,7 +177,7 @@ __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
     __syncthreads();
 
     //-----------------------------------------------------------
-    if(useShared)
+    if (useShared)
     {
         static __shared__ int smallerTotal, biggerTotal;
         if (threadIdx.x == blockDim.x - 1)
@@ -190,10 +188,10 @@ __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
         __syncthreads();
 
         copyDataShared(srcView, dst, Cmp, sharedMem,
-                        smallerStart, biggerStart,
-                        smallerTotal, biggerTotal,
-                        smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
-                        pivot);
+                       smallerStart, biggerStart,
+                       smallerTotal, biggerTotal,
+                       smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
+                       pivot);
     }
     else
     {
diff --git a/src/quicksort/quicksort.cuh b/src/quicksort/quicksort.cuh
index faca018..22dd4cb 100644
--- a/src/quicksort/quicksort.cuh
+++ b/src/quicksort/quicksort.cuh
@@ -68,9 +68,9 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha
 
 template <typename Value, typename Function, bool useShared>
 __global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
-                                    const Function &Cmp, int elemPerBlock,
-                                    ArrayView<TASK, Devices::Cuda> tasks,
-                                    ArrayView<int, Devices::Cuda> taskMapping)
+                                      const Function &Cmp, int elemPerBlock,
+                                      ArrayView<TASK, Devices::Cuda> tasks,
+                                      ArrayView<int, Devices::Cuda> taskMapping)
 {
     extern __shared__ int externMem[];
     Value *sharedMem = (Value *)externMem;
@@ -88,8 +88,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, Array
     cudaPartition<Value, Function, useShared>(
         src.getView(myTask.partitionBegin, myTask.partitionEnd),
         dst.getView(myTask.partitionBegin, myTask.partitionEnd),
-        sharedMem,
-        Cmp, pivot, elemPerBlock, myTask);
+        Cmp, sharedMem, pivot,
+        elemPerBlock, myTask);
 }
 
 //----------------------------------------------------
@@ -153,7 +153,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
                                       int elemInShared)
 {
     extern __shared__ int externMem[];
-    Value * sharedMem = (Value *) externMem;
+    Value *sharedMem = (Value *)externMem;
 
     TASK &myTask = secondPhaseTasks[blockIdx.x];
     if (myTask.partitionEnd - myTask.partitionBegin <= 0)
@@ -162,17 +162,14 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
     auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
     auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);
 
-    if(elemInShared == 0)
+    if (elemInShared == 0)
     {
-        singleBlockQuickSort<Value, Function, stackSize, false>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
     else
     {
-        singleBlockQuickSort<Value, Function, stackSize, true>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
-
 }
 
 template <typename Value, typename Function, int stackSize>
@@ -183,7 +180,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
                                       int elemInShared)
 {
     extern __shared__ int externMem[];
-    Value * sharedMem = (Value *) externMem;
+    Value *sharedMem = (Value *)externMem;
 
     TASK myTask;
     if (blockIdx.x < secondPhaseTasks1.getSize())
@@ -200,15 +197,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
     auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
     auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);
 
-    if(elemInShared == 0)
+    if (elemInShared == 0)
     {
-        singleBlockQuickSort<Value, Function, stackSize, false>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
     else
     {
-        singleBlockQuickSort<Value, Function, stackSize, true>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
 }
 
@@ -366,7 +361,7 @@ void QUICKSORT<Value>::firstPhase(const Function &Cmp)
         auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
 
         int externMemByteSize = elemPerBlock * sizeof(Value);
-        
+
        /**
         * check if can partition using shared memory for coalesced read and write
         * 1st phase of partitioning
@@ -424,7 +419,7 @@ void QUICKSORT<Value>::secondPhase(const Function &Cmp)
     int elemInShared = desiredElemPerBlock;
     int externSharedByteSize = sizeof(Value) * elemInShared;
 
-    if(externSharedByteSize > maxSharable)
+    if (externSharedByteSize > maxSharable)
     {
         externSharedByteSize = 0;
         elemInShared = 0;
@@ -563,7 +558,7 @@ void quicksort(ArrayView<Value, Devices::Cuda> arr, const Function &Cmp)
 
     assert(blockDim * multiplier * sizeof(Value) <= maxSharable);
 
-    QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier*blockDim, maxSharable);
+    QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
     sorter.sort(Cmp);
 }
diff --git a/src/quicksort/quicksort_1Block.cuh b/src/quicksort/quicksort_1Block.cuh
index 6503ab3..e63e4e0 100644
--- a/src/quicksort/quicksort_1Block.cuh
+++ b/src/quicksort/quicksort_1Block.cuh
@@ -11,35 +11,34 @@ using namespace TNL::Containers;
 
 template <typename Value, typename Function>
 __device__ void externSort(ArrayView<Value, TNL::Devices::Cuda> src,
-                           ArrayView<Value, TNL::Devices::Cuda> dst,
-                           Value * sharedMem,
-                           const Function & Cmp)
+                           ArrayView<Value, TNL::Devices::Cuda> dst,
+                           const Function &Cmp, Value *sharedMem)
 {
     bitonicSort_Block(src, dst, sharedMem, Cmp);
 }
 
-template<int stackSize>
+template <int stackSize>
 __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
-                          int stackDepth[], int & stackTop,
-                          int begin, int pivotBegin,
-                          int pivotEnd, int end,
-                          int depth)
+                          int stackDepth[], int &stackTop,
+                          int begin, int pivotBegin,
+                          int pivotEnd, int end,
+                          int depth)
 {
     int sizeL = pivotBegin - begin, sizeR = end - pivotEnd;
-    
+
     //push the bigger one 1st and then smaller one 2nd
     //in next iteration, the smaller part will be handled 1st
-    if(sizeL > sizeR)
+    if (sizeL > sizeR)
     {
-        if(sizeL > 0) //left from pivot are smaller elems
+        if (sizeL > 0) //left from pivot are smaller elems
         {
             stackArrBegin[stackTop] = begin;
             stackArrEnd[stackTop] = pivotBegin;
             stackDepth[stackTop] = depth + 1;
             stackTop++;
         }
-        
-        if(sizeR > 0) //right from pivot until end are elem greater than pivot
+
+        if (sizeR > 0) //right from pivot until end are elem greater than pivot
         {
             assert(stackTop < stackSize && "Local quicksort stack overflow.");
@@ -51,7 +50,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
     }
     else
     {
-        if(sizeR > 0) //right from pivot until end are elem greater than pivot
+        if (sizeR > 0) //right from pivot until end are elem greater than pivot
         {
             stackArrBegin[stackTop] = pivotEnd;
             stackArrEnd[stackTop] = end;
@@ -59,7 +58,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
             stackTop++;
         }
 
-        if(sizeL > 0) //left from pivot are smaller elems
+        if (sizeL > 0) //left from pivot are smaller elems
         {
             assert(stackTop < stackSize && "Local quicksort stack overflow.");
@@ -73,14 +72,14 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
 
 template <typename Value, typename Function, int stackSize, bool useShared>
 __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
-                                     ArrayView<Value, TNL::Devices::Cuda> aux,
-                                     const Function & Cmp, int _depth,
-                                     Value * sharedMem, int memSize)
+                                     ArrayView<Value, TNL::Devices::Cuda> aux,
+                                     const Function &Cmp, int _depth,
+                                     Value *sharedMem, int memSize)
 {
-    if(arr.getSize() <= blockDim.x*2)
+    if (arr.getSize() <= blockDim.x * 2)
     {
-        auto src = (_depth &1) == 0? arr : aux;
-        externSort<Value, Function>(src, arr, sharedMem, Cmp);
+        auto src = (_depth & 1) == 0 ? arr : aux;
+        externSort<Value, Function>(src, arr, Cmp, sharedMem);
         return;
     }
@@ -100,33 +99,33 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
     }
     __syncthreads();
 
-    while(stackTop > 0)
+    while (stackTop > 0)
     {
         //pick up partition to break up
         if (threadIdx.x == 0)
         {
-            begin = stackArrBegin[stackTop-1];
-            end = stackArrEnd[stackTop-1];
-            depth = stackDepth[stackTop-1];
+            begin = stackArrBegin[stackTop - 1];
+            end = stackArrEnd[stackTop - 1];
+            depth = stackDepth[stackTop - 1];
             stackTop--;
         }
         __syncthreads();
 
         int size = end - begin;
-        auto &src = (depth&1) == 0 ? arr : aux;
+        auto &src = (depth & 1) == 0 ? arr : aux;
 
         //small enough for for bitonic
-        if(size <= blockDim.x*2)
+        if (size <= blockDim.x * 2)
         {
-            externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), sharedMem, Cmp);
+            externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem);
             __syncthreads();
             continue;
         }
 
         //------------------------------------------------------
         //actually do partitioning from here on out
-        if(threadIdx.x == 0)
-            pivot = pickPivot(src.getView(begin, end),Cmp);
+        if (threadIdx.x == 0)
+            pivot = pickPivot(src.getView(begin, end), Cmp);
         __syncthreads();
 
         int smaller = 0, bigger = 0;
@@ -148,9 +147,9 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
          * move elements, either use shared mem for coalesced access or without shared mem if data is too big
          * */
-        auto &dst = (depth&1) == 0 ? aux : arr;
+        auto &dst = (depth & 1) == 0 ? aux : arr;
 
-        if(useShared && size <= memSize)
+        if (useShared && size <= memSize)
         {
             static __shared__ int smallerTotal, biggerTotal;
             if (threadIdx.x == blockDim.x - 1)
@@ -160,17 +159,17 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
             }
             __syncthreads();
 
-            copyDataShared(src.getView(begin, end), dst.getView(begin, end), Cmp,
-                           sharedMem,
-                           0, pivotEnd,
-                           smallerTotal, biggerTotal,
-                           smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
-                           pivot);
+            copyDataShared(src.getView(begin, end), dst.getView(begin, end),
+                           Cmp, sharedMem,
+                           0, pivotEnd,
+                           smallerTotal, biggerTotal,
+                           smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
+                           pivot);
         }
         else
         {
             int destSmaller = 0 + (smallerPrefSumInc - smaller);
-            int destBigger = pivotEnd + (biggerPrefSumInc - bigger);
+            int destBigger  = pivotEnd + (biggerPrefSumInc - bigger);
             copyData(src.getView(begin, end), dst.getView(begin, end), Cmp, destSmaller, destBigger, pivot);
         }
@@ -181,13 +180,13 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
             arr[begin + i] = pivot;
 
         //creates new tasks
-        if(threadIdx.x == 0)
+        if (threadIdx.x == 0)
         {
             stackPush<stackSize>(stackArrBegin, stackArrEnd, stackDepth, stackTop,
-                                 begin, begin+ pivotBegin,
-                                 begin +pivotEnd, end,
-                                 depth);
+                                 begin, begin + pivotBegin,
+                                 begin + pivotEnd, end,
+                                 depth);
         }
         __syncthreads(); //sync to update stackTop
-    } //ends while loop
+    }     //ends while loop
 }
\ No newline at end of file
-- 
GitLab
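
[Reviewer note, not part of the patch: pickPivot above is a median-of-three pivot rule, with Cmp acting as a strict "less than". A minimal host-side sketch of the same case analysis, using a hypothetical medianOfThree helper in plain C++ rather than a __device__ function:

    // Median-of-three sketch mirroring pickPivot's branches.
    // Assumes a non-empty range, like the original.
    template <typename Value, typename Function>
    Value medianOfThree(const Value *data, int size, const Function &Cmp)
    {
        if (size == 1)
            return data[0];

        Value a = data[0], b = data[size / 2], c = data[size - 1];

        if (Cmp(a, b))                // ..a..b..
        {
            if (Cmp(b, c))  return b; // ..a..b..c
            if (Cmp(c, a))  return a; // ..c..a..b..
            return c;                 // ..a..c..b..
        }
        if (Cmp(a, c))      return a; // ..b..a..c
        if (Cmp(c, b))      return b; // ..c..b..a..
        return c;                     // ..b..c..a..
    }

Sampling the first, middle, and last elements and taking their median keeps partitions balanced on already-sorted or reverse-sorted input, where a fixed first- or last-element pivot would degrade quicksort to O(n^2).]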
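
[Reviewer note, not part of the patch: the destSmaller/destBigger arithmetic in cudaPartition and singleBlockQuickSort uses the identity "exclusive prefix sum = inclusive prefix sum - own count": each thread counts its smaller/bigger elements, a block-wide inclusive prefix sum is taken, and subtracting the thread's own count yields its first free write slot on each side of the pivot. A runnable sequential sketch of that offset computation, with hypothetical counts and a plain loop standing in for blockInclusivePrefixSum:

    #include <cstdio>

    int main()
    {
        // Per-"thread" counts of elements smaller than the pivot.
        const int counts[4] = {2, 0, 3, 1};

        int inclusive = 0;
        for (int t = 0; t < 4; ++t)
        {
            inclusive += counts[t];                // inclusive prefix sum
            int exclusive = inclusive - counts[t]; // first free write slot
            printf("thread %d writes %d element(s) at offsets [%d, %d)\n",
                   t, counts[t], exclusive, exclusive + counts[t]);
        }
        return 0;
    }

The resulting ranges ([0,2), [2,2), [2,5), [5,6)) are disjoint and gap-free, which is why copyData can increment destSmaller and destBigger privately, with no synchronization between threads.]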