Commit a2f401f2 authored by Xuan Thang Nguyen

set lower bound of single-block quicksort size

parent f30dc8b6
+2 −2
@@ -134,10 +134,10 @@ void copyData(ArrayView<int, Devices::Cuda> src,
        }
        else if (data > pivot)
        {
            /*
            
            if(biggerStart >= dst.getSize() || biggerStart < 0)
                printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize());
            */
            
            dst[biggerStart++] = data;
        }
    }
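The commented-out bounds check above is useful while debugging the partition's write index but costs a branch and a device-side printf on every element. A minimal sketch of keeping such a guard behind a compile-time flag so release builds pay nothing for it; the `QS_DEBUG_BOUNDS` macro and the `guardedWrite` helper are illustrative, not part of TNL:

```cuda
#include <cstdio>

// Hypothetical sketch: the diagnostic compiles away unless QS_DEBUG_BOUNDS is defined.
#ifdef QS_DEBUG_BOUNDS
#define QS_CHECK_WRITE(idx, size)                                          \
    if ((idx) >= (size) || (idx) < 0)                                      \
        printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n",    \
               blockDim.x, threadIdx.x, (idx), (size))
#else
#define QS_CHECK_WRITE(idx, size) ((void)0)
#endif

// Device-side write mirroring the dst[biggerStart++] = data line in the diff.
__device__ void guardedWrite(int *dst, int dstSize, int &biggerStart, int data)
{
    QS_CHECK_WRITE(biggerStart, dstSize);
    dst[biggerStart++] = data;
}
```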
+16 −16
@@ -15,7 +15,7 @@ using namespace TNL::Containers;

//-----------------------------------------------------------

-__device__ void writeNewTask(int begin, int end, int depth, int pivotIdx,
+__device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, int maxElemFor2ndPhase,
                            ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
@@ -29,7 +29,7 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx,
    if (size == 0)
        return;

-    if (size <= blockDim.x * 2)
+    if (size <= maxElemFor2ndPhase)
    {
        int idx = atomicAdd(secondPhaseTasksCnt, 1);
        if (idx < secondPhaseTasks.getSize())
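This change replaces the hard-coded `blockDim.x * 2` cutoff with the caller-supplied `maxElemFor2ndPhase`, so the size below which a partition is deferred to the single-block second phase no longer depends on the launch configuration. A minimal sketch of the deferral pattern, using raw pointers and a simplified `Task` struct in place of TNL's `ArrayView` and `TASK` (field names mirror the diff, the struct itself is illustrative):

```cuda
struct Task { int begin, end, depth, pivotIdx; };

// Small partitions are appended to a shared second-phase buffer; the slot is
// reserved with atomicAdd because many blocks may defer tasks concurrently.
// The real code additionally falls back to another task array on overflow.
__device__ void deferSmallTask(int begin, int end, int depth, int pivotIdx,
                               int maxElemFor2ndPhase,
                               Task *secondPhaseTasks, int secondPhaseCapacity,
                               int *secondPhaseTasksCnt)
{
    int size = end - begin;
    if (size == 0)
        return;

    if (size <= maxElemFor2ndPhase)
    {
        int idx = atomicAdd(secondPhaseTasksCnt, 1);
        if (idx < secondPhaseCapacity)
            secondPhaseTasks[idx] = Task{begin, end, depth, pivotIdx};
    }
}
```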
@@ -98,7 +98,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView<int, Devices::Cuda> arr, ArrayVi

template <typename Function>
__global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int, Devices::Cuda> aux,
-                               const Function &Cmp, int elemPerBlock,
+                               const Function &Cmp, int maxElemFor2ndPhase,
                               ArrayView<TASK, Devices::Cuda> tasks,
                               ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                               ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
@@ -138,7 +138,8 @@ __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int,
                            arr.getView(leftBegin, leftEnd)
                            , Cmp) + leftBegin;

-        writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt);
+        writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, maxElemFor2ndPhase,
+            newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt);
    }

    if(rightEnd - rightBegin > 0)
@@ -148,7 +149,8 @@ __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int,
                                arr.getView(rightBegin, rightEnd)
                            , Cmp) + rightBegin;
                                
-        writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt);
+        writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, maxElemFor2ndPhase,
+            newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt);
    }
}

@@ -216,6 +218,8 @@ __global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k
const int g_maxTasks = 1 << 14;
const int minElemPerBlock = threadsPerBlock*2;
+const int maxBitonicSize = threadsPerBlock*2;
+const int desired_2ndPhasElemPerBlock = maxBitonicSize*8;

class QUICKSORT
{
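Spelling out the arithmetic behind the new constant, which is the lower bound the commit title refers to. The values restate the defaults from the diff; reading `maxBitonicSize` as the capacity of a single-block bitonic sort is inferred from its name:

```cuda
// With threadsPerBlock = 512:
//   maxBitonicSize              = 512 * 2  = 1024  (single-block bitonic sort capacity)
//   desired_2ndPhasElemPerBlock = 1024 * 8 = 8192  (partitions of at most 8192 elements
//                                                   are handed to the second phase)
static_assert(512 * 2 * 8 == 8192, "second-phase element bound with the default block size");
```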
@@ -283,13 +287,14 @@ void QUICKSORT::sort(const Function &Cmp)
        //second-phase task buffer is full or tasksAmount is full; as a backup during writing, overflowing tasks were written into the other array
        if (tasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks)
        {
-            //deb("task overflow")
            break;
        }

        //just in case newly created tasks wouldn't fit
        if(tasksAmount*2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
        {
            break;
        }

        int elemPerBlock = getElemPerBlock();
        int blocksCnt = initTasks(elemPerBlock);
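The guard can be read by rearranging the inequality: each surviving task can spawn up to two children, and children that do not fit into the task array spill into the second-phase array, so the loop must bail out once `2 * tasksAmount` could exceed the task capacity plus the remaining second-phase capacity. A worked host-side check with the defaults above (the standalone function and the sample numbers are illustrative):

```cuda
// maxTasks = 1 << 14 = 16384. With 10000 active tasks and 3000 already deferred:
//   2 * 10000 = 20000 < 16384 + (16384 - 3000) = 29768  -> room left, keep iterating.
// With 15000 active tasks and 15000 deferred:
//   2 * 15000 = 30000 >= 16384 + (16384 - 15000) = 17768 -> break out of the loop.
bool mustBreak(int tasksAmount, int host_2ndPhaseTasksAmount, int maxTasks)
{
    return tasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount);
}
```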
@@ -305,7 +310,7 @@ void QUICKSORT::sort(const Function &Cmp)

        auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks;
        cudaWritePivot<<<tasksAmount, 512>>>(
-            arr, aux, Cmp, elemPerBlock,
+            arr, aux, Cmp, desired_2ndPhasElemPerBlock,
            task,
            newTask,
            cuda_newTasksAmount.getData(),
@@ -330,14 +335,9 @@ void QUICKSORT::sort(const Function &Cmp)
    
    if (host_2ndPhaseTasksAmount > 0)
    {
-        cudaStream_t s;
-        cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);

        cudaQuickSort2ndPhase<Function, 128>
-            <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock, 0, s>>>
+            <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>>
            (arr, aux, Cmp, cuda_2ndPhaseTasks);

-        cudaStreamDestroy(s);
    }
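Launching on the default stream drops the create/destroy boilerplate and serializes the second phase behind the preceding kernels. Note that the removed code destroyed the stream right after an asynchronous launch; if a dedicated stream were still wanted, the usual pattern synchronizes before tearing it down. A generic sketch, not TNL code, with a trivial `kernel` standing in for `cudaQuickSort2ndPhase`:

```cuda
__global__ void kernel() { /* stand-in for cudaQuickSort2ndPhase */ }

void launchOnOwnStream(int blocks, int threadsPerBlock)
{
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);

    kernel<<<blocks, threadsPerBlock, 0, s>>>();

    // Wait for the work to finish before releasing the stream.
    cudaStreamSynchronize(s);
    cudaStreamDestroy(s);
}
```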