Commit bccf9bb2 authored by Xuan Thang Nguyen
Browse files

change bitonic threshold size

parent f7c80110
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -30,7 +30,7 @@ class QUICKSORT

    //--------------------------------------

    const int maxBitonicSize = threadsPerBlock * 2;
    const int maxBitonicSize = threadsPerBlock * 8;
    const int desired_2ndPhasElemPerBlock = maxBitonicSize;
    const int g_maxTasks = 1 << 14;
    int maxTasks;
@@ -284,20 +284,20 @@ void QUICKSORT<Value>::secondPhase(const Function &Cmp)
        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);

        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared);
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared, maxBitonicSize);
    }
    else if (host_1stPhaseTasksAmount > 0)
    {
        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared);
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared, maxBitonicSize);
    }
    else
    {
        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);

        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared);
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared, maxBitonicSize);
    }
}

+4 −3
Original line number Diff line number Diff line
@@ -39,9 +39,10 @@ template <typename Value, typename Function, int stackSize, bool useShared>
__device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
                                     ArrayView<Value, TNL::Devices::Cuda> aux,
                                     const Function &Cmp, int _depth,
                                     Value *sharedMem, int memSize)
                                     Value *sharedMem, int memSize,
                                     int maxBitonicSize)
{
    if (arr.getSize() <= blockDim.x * 2)
    if (arr.getSize() <= maxBitonicSize)
    {
        auto &src = (_depth & 1) == 0 ? arr : aux;
        if (useShared && arr.getSize() <= memSize)
@@ -91,7 +92,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
        auto &src = (depth & 1) == 0 ? arr : aux;

        //small enough for bitonic
        if (size <= blockDim.x * 2)
        if (size <= maxBitonicSize)
        {
            if (useShared && size <= memSize)
                externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem);
+10 −6
Original line number Diff line number Diff line
@@ -185,7 +185,7 @@ template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks,
                                      int elemInShared)
                                      int elemInShared, int maxBitonicSize)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;
@@ -202,11 +202,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

    if (elemInShared == 0)
    {
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0);
        singleBlockQuickSort<Value, Function, stackSize, false>
            (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
    }
    else
    {
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
        singleBlockQuickSort<Value, Function, stackSize, true>
            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
    }
}

@@ -215,7 +217,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks1,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks2,
                                      int elemInShared)
                                      int elemInShared, int maxBitonicSize)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;
@@ -237,11 +239,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

    if (elemInShared == 0)
    {
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0);
        singleBlockQuickSort<Value, Function, stackSize, false>
            (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
    }
    else
    {
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
        singleBlockQuickSort<Value, Function, stackSize, true>
            (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
    }
}