Commit 2f143034 authored by Xuan Thang Nguyen
Browse files

refactor depth -> iteration

parent 4194c809
Loading
Loading
Loading
Loading
+16 −16
Original line number Diff line number Diff line
@@ -31,27 +31,27 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
                          int stackDepth[], int &stackTop,
                          int begin, int pivotBegin,
                          int pivotEnd, int end,
                          int depth);
                          int iteration);

//---------------------------------------------------------------

template <typename Value, typename CMP, int stackSize, bool useShared>
__device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
                                     ArrayView<Value, TNL::Devices::Cuda> aux,
                                     const CMP &Cmp, int _depth,
                                     const CMP &Cmp, int _iteration,
                                     Value *sharedMem, int memSize,
                                     int maxBitonicSize)
{
    if (arr.getSize() <= maxBitonicSize)
    {
        auto &src = (_depth & 1) == 0 ? arr : aux;
        auto &src = (_iteration & 1) == 0 ? arr : aux;
        if (useShared && arr.getSize() <= memSize)
            externSort<Value, CMP>(src, arr, Cmp, sharedMem);
        else
        {
            externSort<Value, CMP>(src, Cmp);
            //extern sort without shared memory only works in-place, need to copy from aux back into arr
            if ((_depth & 1) != 0)
            if ((_iteration & 1) != 0)
                for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x)
                    arr[i] = src[i];
        }
@@ -61,7 +61,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,

    static __shared__ int stackTop;
    static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize];
    static __shared__ int begin, end, depth;
    static __shared__ int begin, end, iteration;
    static __shared__ int pivotBegin, pivotEnd;
    Value *piv = sharedMem;
    sharedMem += 1;
@@ -71,7 +71,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
        stackTop = 0;
        stackArrBegin[stackTop] = 0;
        stackArrEnd[stackTop] = arr.getSize();
        stackDepth[stackTop] = _depth;
        stackDepth[stackTop] = _iteration;
        stackTop++;
    }
    __syncthreads();
@@ -83,13 +83,13 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
        {
            begin = stackArrBegin[stackTop - 1];
            end = stackArrEnd[stackTop - 1];
            depth = stackDepth[stackTop - 1];
            iteration = stackDepth[stackTop - 1];
            stackTop--;
        }
        __syncthreads();

        int size = end - begin;
        auto &src = (depth & 1) == 0 ? arr : aux;
        auto &src = (iteration & 1) == 0 ? arr : aux;

        //small enough for bitonic sort
        if (size <= maxBitonicSize)
@@ -100,7 +100,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
            {
                externSort<Value, CMP>(src.getView(begin, end), Cmp);
                //extern sort without shared memory only works in-place, need to copy from aux back into arr
                if ((depth & 1) != 0)
                if ((iteration & 1) != 0)
                    for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
                        arr[begin + i] = src[i];
            }
@@ -134,7 +134,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
         * move elements, either use shared mem for coalesced access or without shared mem if data is too big
         * */

        auto &dst = (depth & 1) == 0 ? aux : arr;
        auto &dst = (iteration & 1) == 0 ? aux : arr;

        if (useShared && size <= memSize)
        {
@@ -172,7 +172,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
            stackPush<stackSize>(stackArrBegin, stackArrEnd, stackDepth, stackTop,
                                 begin, begin + pivotBegin,
                                 begin + pivotEnd, end,
                                 depth);
                                 iteration);
        }
        __syncthreads(); //sync to update stackTop
    }                    //ends while loop
@@ -185,7 +185,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
                          int stackDepth[], int &stackTop,
                          int begin, int pivotBegin,
                          int pivotEnd, int end,
                          int depth)
                          int iteration)
{
    int sizeL = pivotBegin - begin, sizeR = end - pivotEnd;

@@ -197,7 +197,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
        {
            stackArrBegin[stackTop] = begin;
            stackArrEnd[stackTop] = pivotBegin;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }

@@ -207,7 +207,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],

            stackArrBegin[stackTop] = pivotEnd;
            stackArrEnd[stackTop] = end;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }
    }
@@ -217,7 +217,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
        {
            stackArrBegin[stackTop] = pivotEnd;
            stackArrEnd[stackTop] = end;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }

@@ -227,7 +227,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],

            stackArrBegin[stackTop] = begin;
            stackArrEnd[stackTop] = pivotBegin;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }
    }
+15 −15
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ using namespace TNL::Containers;

//-----------------------------------------------------------

__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase,
                             ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt);

@@ -67,8 +67,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, Array
    Value *sharedMem = piv + 1;

    TASK &myTask = tasks[taskMapping[blockIdx.x]];
    auto &src = (myTask.depth & 1) == 0 ? arr : aux;
    auto &dst = (myTask.depth & 1) == 0 ? aux : arr;
    auto &src = (myTask.iteration & 1) == 0 ? arr : aux;
    auto &dst = (myTask.iteration & 1) == 0 ? aux : arr;

    if (threadIdx.x == 0)
        *piv = src[myTask.pivotIdx];
@@ -95,7 +95,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va
    TASK &myTask = tasks[blockIdx.x];

    if (threadIdx.x == 0)
        *piv = (myTask.depth & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];
        *piv = (myTask.iteration & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];
    __syncthreads();
    Value &pivot = *piv;

@@ -117,7 +117,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va

    if (leftEnd - leftBegin > 0)
    {
        writeNewTask(leftBegin, leftEnd, myTask.depth,
        writeNewTask(leftBegin, leftEnd, myTask.iteration,
                     maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
@@ -126,7 +126,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va
    if (rightEnd - rightBegin > 0)
    {
        writeNewTask(rightBegin, rightEnd,
                     myTask.depth, maxElemFor2ndPhase,
                     myTask.iteration, maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
    }
@@ -134,7 +134,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va

//-----------------------------------------------------------

__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase,
                             ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
@@ -152,13 +152,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha
    {
        int idx = atomicAdd(secondPhaseTasksCnt, 1);
        if (idx < secondPhaseTasks.getSize())
            secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
            secondPhaseTasks[idx] = TASK(begin, end, iteration + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(newTasksCnt, 1);
            if (idx < newTasks.getSize())
                newTasks[idx] = TASK(begin, end, depth + 1);
                newTasks[idx] = TASK(begin, end, iteration + 1);
            else
                printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n");
        }
@@ -167,13 +167,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha
    {
        int idx = atomicAdd(newTasksCnt, 1);
        if (idx < newTasks.getSize())
            newTasks[idx] = TASK(begin, end, depth + 1);
            newTasks[idx] = TASK(begin, end, iteration + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(secondPhaseTasksCnt, 1);
            if (idx < secondPhaseTasks.getSize())
                secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
                secondPhaseTasks[idx] = TASK(begin, end, iteration + 1);
            else
                printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n");
        }
@@ -203,11 +203,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

    if (elemInShared == 0)
    {
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize);
    }
    else
    {
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize);
    }
}

@@ -238,11 +238,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

    if (elemInShared <= 0)
    {
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize);
    }
    else
    {
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize);
    }
}

+4 −4
Original line number Diff line number Diff line
@@ -7,15 +7,15 @@ struct TASK
    //-----------------------------------------------
    //helper variables for blocks working on this task

    int depth;
    int iteration;
    int pivotIdx;
    int dstBegin, dstEnd;
    int firstBlock, blockCount;//for workers read only values

    __cuda_callable__
    TASK(int begin, int end, int depth)
    TASK(int begin, int end, int iteration)
        : partitionBegin(begin), partitionEnd(end),
        depth(depth), pivotIdx(-1),
        iteration(iteration), pivotIdx(-1),
        dstBegin(-151561), dstEnd(-151561),
        firstBlock(-100), blockCount(-100)
        {}
@@ -42,7 +42,7 @@ std::ostream& operator<<(std::ostream & out, const TASK & task)
{
    out << "[ ";
    out << task.partitionBegin << " - " << task.partitionEnd;
    out << " | " << "depth: " << task.depth;
    out << " | " << "iteration: " << task.iteration;
    out << " | " << "pivotIdx: " << task.pivotIdx;
    return out << " ] ";
}
 No newline at end of file