From a8d88571df8452188365f5ef6bf31a7f6570081e Mon Sep 17 00:00:00 2001
From: Xuan Thang Nguyen <nguyexu2@fit.cvut.cz>
Date: Wed, 7 Apr 2021 23:10:02 +0200
Subject: [PATCH] file formatting and switch Cmp position in function call

---
 src/quicksort/cudaPartition.cuh    | 106 ++++++++++++++---------
 src/quicksort/quicksort.cuh        |  37 +++++-----
 src/quicksort/quicksort_1Block.cuh |  87 ++++++++++++-----------
 3 files changed, 111 insertions(+), 119 deletions(-)

diff --git a/src/quicksort/cudaPartition.cuh b/src/quicksort/cudaPartition.cuh
index fcca2c3..ebc2813 100644
--- a/src/quicksort/cudaPartition.cuh
+++ b/src/quicksort/cudaPartition.cuh
@@ -8,93 +8,92 @@ using namespace TNL;
 using namespace TNL::Containers;
 
 template <typename Value, typename Device, typename Function>
-__device__ Value pickPivot(TNL::Containers::ArrayView<Value, Device> src, const Function & Cmp)
+__device__ Value pickPivot(TNL::Containers::ArrayView<Value, Device> src, const Function &Cmp)
 {
     //return src[0];
     //return src[src.getSize()-1];
 
-    if(src.getSize() ==1)
+    if (src.getSize() == 1)
         return src[0];
-    
-    Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1];
-    if(Cmp(a, b)) // ..a..b..
+    Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1];
+
+    if (Cmp(a, b)) // ..a..b..
     {
-        if(Cmp(b, c))// ..a..b..c
+        if (Cmp(b, c)) // ..a..b..c
            return b;
-        else if(Cmp(c, a))//..c..a..b..
+        else if (Cmp(c, a)) //..c..a..b..
            return a;
        else //..a..c..b..
            return c;
    }
    else //..b..a..
    {
-        if(Cmp(a, c))//..b..a..c
+        if (Cmp(a, c)) //..b..a..c
            return a;
-        else if(Cmp(c, b))//..c..b..a..
+        else if (Cmp(c, b)) //..c..b..a..
            return b;
        else //..b..c..a..
            return c;
    }
-    
 }
 
 template <typename Value, typename Device, typename Function>
-__device__ Value pickPivotIdx(TNL::Containers::ArrayView<Value, Device> src, const Function & Cmp)
+__device__ Value pickPivotIdx(TNL::Containers::ArrayView<Value, Device> src, const Function &Cmp)
 {
     //return 0;
     //return src.getSize()-1;
 
-    if(src.getSize() <= 1)
+    if (src.getSize() <= 1)
        return 0;
-    
-    Value a = src[0], b = src[src.getSize()/2], c = src[src.getSize() - 1];
-    if(Cmp(a, b)) // ..a..b..
+    Value a = src[0], b = src[src.getSize() / 2], c = src[src.getSize() - 1];
+
+    if (Cmp(a, b)) // ..a..b..
    {
-        if(Cmp(b, c))// ..a..b..c
-            return src.getSize()/2;
-        else if(Cmp(c, a))//..c..a..b..
+        if (Cmp(b, c)) // ..a..b..c
+            return src.getSize() / 2;
+        else if (Cmp(c, a)) //..c..a..b..
            return 0;
        else //..a..c..b..
            return src.getSize() - 1;
    }
    else //..b..a..
    {
-        if(Cmp(a, c))//..b..a..c
+        if (Cmp(a, c)) //..b..a..c
            return 0;
-        else if(Cmp(c, b))//..c..b..a..
-            return src.getSize()/2;
+        else if (Cmp(c, b)) //..c..b..a..
+            return src.getSize() / 2;
        else //..b..c..a..
            return src.getSize() - 1;
    }
 }
 
 template <typename Value, typename Function>
-__device__
-void countElem(ArrayView<Value, Devices::Cuda> arr, const Function & Cmp,
-               int &smaller, int &bigger,
-               const Value &pivot)
+__device__ void countElem(ArrayView<Value, Devices::Cuda> arr,
+                          const Function &Cmp,
+                          int &smaller, int &bigger,
+                          const Value &pivot)
 {
     for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x)
     {
         const Value data = arr[i];
-        if(Cmp(data, pivot))
+        if (Cmp(data, pivot))
             smaller++;
-        else if(Cmp(pivot, data) )
+        else if (Cmp(pivot, data))
             bigger++;
     }
 }
 
 template <typename Value, typename Function>
-__device__
-void copyDataShared(ArrayView<Value, Devices::Cuda> src,
-                    ArrayView<Value, Devices::Cuda> dst, const Function & Cmp,
-                    Value *sharedMem,
-                    int smallerStart, int biggerStart,
-                    int smallerTotal, int biggerTotal,
-                    int smallerOffset, int biggerOffset, //exclusive prefix sum of elements
-                    const Value &pivot)
+__device__ void copyDataShared(ArrayView<Value, Devices::Cuda> src,
+                               ArrayView<Value, Devices::Cuda> dst,
+                               const Function &Cmp,
+                               Value *sharedMem,
+                               int smallerStart, int biggerStart,
+                               int smallerTotal, int biggerTotal,
+                               int smallerOffset, int biggerOffset, //exclusive prefix sum of elements
+                               const Value &pivot)
 {
 
     for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
@@ -109,7 +108,7 @@ void copyDataShared(ArrayView<Value, Devices::Cuda> src,
 
     for (int i = threadIdx.x; i < smallerTotal + biggerTotal; i += blockDim.x)
     {
-        if(i < smallerTotal)
+        if (i < smallerTotal)
             dst[smallerStart + i] = sharedMem[i];
         else
             dst[biggerStart + i - smallerTotal] = sharedMem[i];
@@ -117,17 +116,16 @@ void copyDataShared(ArrayView<Value, Devices::Cuda> src,
 }
 
 template <typename Value, typename Function>
-__device__
-void copyData(ArrayView<Value, Devices::Cuda> src,
-              ArrayView<Value, Devices::Cuda> dst,
-              const Function & Cmp,
-              int smallerStart, int biggerStart,
-              const Value &pivot)
+__device__ void copyData(ArrayView<Value, Devices::Cuda> src,
+                         ArrayView<Value, Devices::Cuda> dst,
+                         const Function &Cmp,
+                         int smallerStart, int biggerStart,
+                         const Value &pivot)
 {
     for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
     {
         const Value data = src[i];
-        if ( Cmp(data, pivot) )
+        if (Cmp(data, pivot))
         {
             /*
             if(smallerStart >= dst.getSize() || smallerStart < 0)
@@ -135,7 +133,7 @@ void copyData(ArrayView<Value, Devices::Cuda> src,
             */
             dst[smallerStart++] = data;
         }
-        else if ( Cmp(pivot, data) )
+        else if (Cmp(pivot, data))
         {
             /*
             if(biggerStart >= dst.getSize() || biggerStart < 0)
@@ -151,10 +149,10 @@
 template <typename Value, typename Function, bool useShared>
 __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
                               ArrayView<Value, Devices::Cuda> dst,
-                              Value * sharedMem,
-                              const Function &Cmp, const Value & pivot,
-                              int elemPerBlock, TASK & task
-                              )
+                              const Function &Cmp,
+                              Value *sharedMem,
+                              const Value &pivot,
+                              int elemPerBlock, TASK &task)
 {
     static __shared__ int smallerStart, biggerStart;
 
@@ -167,7 +165,7 @@ __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
 
     int smaller = 0, bigger = 0;
     countElem(srcView, Cmp, smaller, bigger, pivot);
-    
+
     int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
     int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
 
@@ -179,7 +177,7 @@ __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
     __syncthreads();
 
     //-----------------------------------------------------------
-    if(useShared)
+    if (useShared)
     {
         static __shared__ int smallerTotal, biggerTotal;
         if (threadIdx.x == blockDim.x - 1)
@@ -190,10 +188,10 @@ __device__ void cudaPartition(ArrayView<Value, Devices::Cuda> src,
         __syncthreads();
 
         copyDataShared(srcView, dst, Cmp, sharedMem,
-                        smallerStart, biggerStart,
-                        smallerTotal, biggerTotal,
-                        smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
-                        pivot);
+                       smallerStart, biggerStart,
+                       smallerTotal, biggerTotal,
+                       smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
+                       pivot);
     }
     else
     {
diff --git a/src/quicksort/quicksort.cuh b/src/quicksort/quicksort.cuh
index faca018..22dd4cb 100644
--- a/src/quicksort/quicksort.cuh
+++ b/src/quicksort/quicksort.cuh
@@ -68,9 +68,9 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha
 
 template <typename Value, typename Function, bool useShared>
 __global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
-                                    const Function &Cmp, int elemPerBlock,
-                                    ArrayView<TASK, Devices::Cuda> tasks,
-                                    ArrayView<int, Devices::Cuda> taskMapping)
+                                      const Function &Cmp, int elemPerBlock,
+                                      ArrayView<TASK, Devices::Cuda> tasks,
+                                      ArrayView<int, Devices::Cuda> taskMapping)
 {
     extern __shared__ int externMem[];
     Value *sharedMem = (Value *)externMem;
@@ -88,8 +88,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, Array
     cudaPartition<Value, Function, useShared>(
         src.getView(myTask.partitionBegin, myTask.partitionEnd),
         dst.getView(myTask.partitionBegin, myTask.partitionEnd),
-        sharedMem,
-        Cmp, pivot, elemPerBlock, myTask);
+        Cmp, sharedMem, pivot,
+        elemPerBlock, myTask);
 }
 
 //----------------------------------------------------
@@ -153,7 +153,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
                                       int elemInShared)
 {
     extern __shared__ int externMem[];
-    Value * sharedMem = (Value *) externMem;
+    Value *sharedMem = (Value *)externMem;
 
     TASK &myTask = secondPhaseTasks[blockIdx.x];
     if (myTask.partitionEnd - myTask.partitionBegin <= 0)
@@ -162,17 +162,14 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
     auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
     auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);
 
-    if(elemInShared == 0)
+    if (elemInShared == 0)
     {
-        singleBlockQuickSort<Value, Function, stackSize, false>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
     else
     {
-        singleBlockQuickSort<Value, Function, stackSize, true>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
-
 }
 
 template <typename Value, typename Function, int stackSize>
@@ -183,7 +180,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
                                       int elemInShared)
 {
     extern __shared__ int externMem[];
-    Value * sharedMem = (Value *) externMem;
+    Value *sharedMem = (Value *)externMem;
 
     TASK myTask;
     if (blockIdx.x < secondPhaseTasks1.getSize())
@@ -200,15 +197,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
     auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
     auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);
 
-    if(elemInShared == 0)
+    if (elemInShared == 0)
     {
-        singleBlockQuickSort<Value, Function, stackSize, false>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
     else
     {
-        singleBlockQuickSort<Value, Function, stackSize, true>
-            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
+        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
     }
 }
 
@@ -366,7 +361,7 @@ void QUICKSORT<Value>::firstPhase(const Function &Cmp)
         auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
 
         int externMemByteSize = elemPerBlock * sizeof(Value);
-        
+
        /**
         * check if can partition using shared memory for coalesced read and write
         * 1st phase of partitioning
@@ -424,7 +419,7 @@ void QUICKSORT<Value>::secondPhase(const Function &Cmp)
     int elemInShared = desiredElemPerBlock;
     int externSharedByteSize = sizeof(Value) * elemInShared;
 
-    if(externSharedByteSize > maxSharable)
+    if (externSharedByteSize > maxSharable)
     {
         externSharedByteSize = 0;
         elemInShared = 0;
@@ -563,7 +558,7 @@ void quicksort(ArrayView<Value, Devices::Cuda> arr, const Function &Cmp)
 
     assert(blockDim * multiplier * sizeof(Value) <= maxSharable);
 
-    QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier*blockDim, maxSharable);
+    QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
     sorter.sort(Cmp);
 }
diff --git a/src/quicksort/quicksort_1Block.cuh b/src/quicksort/quicksort_1Block.cuh
index 6503ab3..e63e4e0 100644
--- a/src/quicksort/quicksort_1Block.cuh
+++ b/src/quicksort/quicksort_1Block.cuh
@@ -11,35 +11,34 @@ using namespace TNL::Containers;
 
 template <typename Value, typename Function>
 __device__ void externSort(ArrayView<Value, TNL::Devices::Cuda> src,
-                           ArrayView<Value, TNL::Devices::Cuda> dst,
-                           Value * sharedMem,
-                           const Function & Cmp)
+                           ArrayView<Value, TNL::Devices::Cuda> dst,
+                           const Function &Cmp, Value *sharedMem)
 {
     bitonicSort_Block(src, dst, sharedMem, Cmp);
 }
 
-template<int stackSize>
+template <int stackSize>
 __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
-                          int stackDepth[], int & stackTop,
-                          int begin, int pivotBegin,
-                          int pivotEnd, int end,
-                          int depth)
+                          int stackDepth[], int &stackTop,
+                          int begin, int pivotBegin,
+                          int pivotEnd, int end,
+                          int depth)
 {
     int sizeL = pivotBegin - begin, sizeR = end - pivotEnd;
-    
+
     //push the bigger one 1st and then smaller one 2nd
     //in next iteration, the smaller part will be handled 1st
-    if(sizeL > sizeR)
+    if (sizeL > sizeR)
     {
-        if(sizeL > 0) //left from pivot are smaller elems
+        if (sizeL > 0) //left from pivot are smaller elems
         {
             stackArrBegin[stackTop] = begin;
             stackArrEnd[stackTop] = pivotBegin;
             stackDepth[stackTop] = depth + 1;
             stackTop++;
         }
-        
-        if(sizeR > 0) //right from pivot until end are elem greater than pivot
+
+        if (sizeR > 0) //right from pivot until end are elem greater than pivot
         {
             assert(stackTop < stackSize && "Local quicksort stack overflow.");
@@ -51,7 +50,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
     }
     else
     {
-        if(sizeR > 0) //right from pivot until end are elem greater than pivot
+        if (sizeR > 0) //right from pivot until end are elem greater than pivot
         {
             stackArrBegin[stackTop] = pivotEnd;
             stackArrEnd[stackTop] = end;
@@ -59,7 +58,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
             stackTop++;
         }
 
-        if(sizeL > 0) //left from pivot are smaller elems
+        if (sizeL > 0) //left from pivot are smaller elems
         {
             assert(stackTop < stackSize && "Local quicksort stack overflow.");
@@ -73,14 +72,14 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
 
 template <typename Value, typename Function, int stackSize, bool useShared>
 __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
-                                     ArrayView<Value, TNL::Devices::Cuda> aux,
-                                     const Function & Cmp, int _depth,
-                                     Value * sharedMem, int memSize)
+                                     ArrayView<Value, TNL::Devices::Cuda> aux,
+                                     const Function &Cmp, int _depth,
+                                     Value *sharedMem, int memSize)
 {
-    if(arr.getSize() <= blockDim.x*2)
+    if (arr.getSize() <= blockDim.x * 2)
     {
-        auto src = (_depth &1) == 0? arr : aux;
-        externSort<Value, Function>(src, arr, sharedMem, Cmp);
+        auto src = (_depth & 1) == 0 ? arr : aux;
+        externSort<Value, Function>(src, arr, Cmp, sharedMem);
         return;
     }
@@ -100,33 +99,33 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
     }
     __syncthreads();
 
-    while(stackTop > 0)
+    while (stackTop > 0)
     {
         //pick up partition to break up
         if (threadIdx.x == 0)
         {
-            begin = stackArrBegin[stackTop-1];
-            end = stackArrEnd[stackTop-1];
-            depth = stackDepth[stackTop-1];
+            begin = stackArrBegin[stackTop - 1];
+            end = stackArrEnd[stackTop - 1];
+            depth = stackDepth[stackTop - 1];
             stackTop--;
         }
         __syncthreads();
 
         int size = end - begin;
-        auto &src = (depth&1) == 0 ? arr : aux;
+        auto &src = (depth & 1) == 0 ? arr : aux;
 
         //small enough for for bitonic
-        if(size <= blockDim.x*2)
+        if (size <= blockDim.x * 2)
         {
-            externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), sharedMem, Cmp);
+            externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem);
             __syncthreads();
             continue;
         }
 
         //------------------------------------------------------
         //actually do partitioning from here on out
-        if(threadIdx.x == 0)
-            pivot = pickPivot(src.getView(begin, end),Cmp);
+        if (threadIdx.x == 0)
+            pivot = pickPivot(src.getView(begin, end), Cmp);
         __syncthreads();
 
         int smaller = 0, bigger = 0;
@@ -148,9 +147,9 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
          * move elements, either use shared mem for coalesced access or without shared mem if data is too big
          * */
-        auto &dst = (depth&1) == 0 ? aux : arr;
+        auto &dst = (depth & 1) == 0 ? aux : arr;
 
-        if(useShared && size <= memSize)
+        if (useShared && size <= memSize)
         {
             static __shared__ int smallerTotal, biggerTotal;
             if (threadIdx.x == blockDim.x - 1)
@@ -160,17 +159,17 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
             }
             __syncthreads();
 
-            copyDataShared(src.getView(begin, end), dst.getView(begin, end), Cmp,
-                           sharedMem,
-                           0, pivotEnd,
-                           smallerTotal, biggerTotal,
-                           smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
-                           pivot);
+            copyDataShared(src.getView(begin, end), dst.getView(begin, end),
+                           Cmp, sharedMem,
+                           0, pivotEnd,
+                           smallerTotal, biggerTotal,
+                           smallerPrefSumInc - smaller, biggerPrefSumInc - bigger, //exclusive prefix sum of elements
+                           pivot);
         }
         else
         {
             int destSmaller = 0 + (smallerPrefSumInc - smaller);
-            int destBigger = pivotEnd + (biggerPrefSumInc - bigger);
+            int destBigger  = pivotEnd + (biggerPrefSumInc - bigger);
             copyData(src.getView(begin, end), dst.getView(begin, end), Cmp, destSmaller, destBigger, pivot);
         }
@@ -181,13 +180,13 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
             arr[begin + i] = pivot;
 
         //creates new tasks
-        if(threadIdx.x == 0)
+        if (threadIdx.x == 0)
         {
             stackPush<stackSize>(stackArrBegin, stackArrEnd, stackDepth, stackTop,
-                                 begin, begin+ pivotBegin,
-                                 begin +pivotEnd, end,
-                                 depth);
+                                 begin, begin + pivotBegin,
+                                 begin + pivotEnd, end,
+                                 depth);
         }
         __syncthreads(); //sync to update stackTop
-    } //ends while loop
+    }     //ends while loop
 }
\ No newline at end of file
-- 
GitLab
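
[Reviewer note, not part of the patch: pickPivot above is a median-of-three pivot rule, with Cmp acting as a strict "less than". A minimal host-side sketch of the same case analysis, using a hypothetical medianOfThree helper in plain C++ rather than a __device__ function:

    // Median-of-three sketch mirroring pickPivot's branches.
    // Assumes a non-empty range, like the original.
    template <typename Value, typename Function>
    Value medianOfThree(const Value *data, int size, const Function &Cmp)
    {
        if (size == 1)
            return data[0];

        Value a = data[0], b = data[size / 2], c = data[size - 1];

        if (Cmp(a, b))                // ..a..b..
        {
            if (Cmp(b, c))  return b; // ..a..b..c
            if (Cmp(c, a))  return a; // ..c..a..b..
            return c;                 // ..a..c..b..
        }
        if (Cmp(a, c))      return a; // ..b..a..c
        if (Cmp(c, b))      return b; // ..c..b..a..
        return c;                     // ..b..c..a..
    }

Sampling the first, middle, and last elements and taking their median keeps partitions balanced on already-sorted or reverse-sorted input, where a fixed first- or last-element pivot would degrade quicksort to O(n^2).]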
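
[Reviewer note, not part of the patch: the destSmaller/destBigger arithmetic in cudaPartition and singleBlockQuickSort uses the identity "exclusive prefix sum = inclusive prefix sum - own count": each thread counts its smaller/bigger elements, a block-wide inclusive prefix sum is taken, and subtracting the thread's own count yields its first free write slot on each side of the pivot. A runnable sequential sketch of that offset computation, with hypothetical counts and a plain loop standing in for blockInclusivePrefixSum:

    #include <cstdio>

    int main()
    {
        // Per-"thread" counts of elements smaller than the pivot.
        const int counts[4] = {2, 0, 3, 1};

        int inclusive = 0;
        for (int t = 0; t < 4; ++t)
        {
            inclusive += counts[t];                // inclusive prefix sum
            int exclusive = inclusive - counts[t]; // first free write slot
            printf("thread %d writes %d element(s) at offsets [%d, %d)\n",
                   t, counts[t], exclusive, exclusive + counts[t]);
        }
        return 0;
    }

The resulting ranges ([0,2), [2,2), [2,5), [5,6)) are disjoint and gap-free, which is why copyData can increment destSmaller and destBigger privately, with no synchronization between threads.]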