#pragma once

#include <TNL/Containers/Array.h>
#include "../util/reduction.cuh"
#include "task.h"
#include "cudaPartition.cuh"
#include "quicksort_1Block.cuh"
#include "../bitonicSort/bitonicSort.h"
#include <iostream>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>

#define deb(x) std::cout << #x << " = " << x << std::endl;

using namespace TNL;
using namespace TNL::Containers;

//-----------------------------------------------------------

__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
                             ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
    int size = end - begin;
    if (size < 0)
    {
        printf("negative size, something went really wrong\n");
        return;
    }

    if (size == 0)
        return;
    if (size <= maxElemFor2ndPhase)
    {
        int idx = atomicAdd(secondPhaseTasksCnt, 1);
        if (idx < secondPhaseTasks.getSize())
            secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(newTasksCnt, 1);
            if (idx < newTasks.getSize())
                newTasks[idx] = TASK(begin, end, depth + 1);
            else
                printf("ran out of memory for second phase task, there isn't even space in the newTasks list\nPart of the array may stay unsorted!!!\n");
        }
    }
    else
    {
        int idx = atomicAdd(newTasksCnt, 1);
        if (idx < newTasks.getSize())
            newTasks[idx] = TASK(begin, end, depth + 1);
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(secondPhaseTasksCnt, 1);
            if (idx < secondPhaseTasks.getSize())
                secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
            else
                printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n");
        }
    }
}
//----------------------------------------------------
template <typename Value, typename Function, bool useShared>
__global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp, int elemPerBlock,
                                      ArrayView<TASK, Devices::Cuda> tasks,
                                      ArrayView<int, Devices::Cuda> taskMapping)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;
    TASK &myTask = tasks[taskMapping[blockIdx.x]];
    auto &src = (myTask.depth & 1) == 0 ? arr : aux;
    auto &dst = (myTask.depth & 1) == 0 ? aux : arr;
    Value pivot = src[myTask.pivotIdx];
    cudaPartition<Value, Function, useShared>(
        src.getView(myTask.partitionBegin, myTask.partitionEnd),
        dst.getView(myTask.partitionBegin, myTask.partitionEnd),
        Cmp, sharedMem, pivot,
        elemPerBlock, myTask);
}

//----------------------------------------------------

template <typename Value>
__global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, int maxElemFor2ndPhase,
                               ArrayView<TASK, Devices::Cuda> tasks, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                               ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
    TASK &myTask = tasks[blockIdx.x];

    Value pivot = (myTask.depth & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];
    int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin;
    int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd;

    for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x)
    {
        /*
        #ifdef DEBUG
        aux[i] = -1;
        #endif
        */
        arr[i] = pivot;
    }
    if (threadIdx.x != 0)
        return;

    if (leftEnd - leftBegin > 0)
        writeNewTask(leftBegin, leftEnd, myTask.depth,
                     maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
    if (rightEnd - rightBegin > 0)
        writeNewTask(rightBegin, rightEnd,
                     myTask.depth, maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
}

//-----------------------------------------------------------

template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks,
                                      int elemInShared)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;
    TASK &myTask = secondPhaseTasks[blockIdx.x];
    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
        return;

    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);

    if (elemInShared <= 0)
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    else
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
}
template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks1,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks2,
                                      int elemInShared)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;

    TASK myTask;
    if (blockIdx.x < secondPhaseTasks1.getSize())
        myTask = secondPhaseTasks1[blockIdx.x];
    else
        myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()];

    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
    {
        printf("empty task???\n");

    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);

    if (elemInShared <= 0)
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    else
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
}
//-----------------------------------------------------------

__global__ void cudaCalcBlocksNeeded(ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
                                     ArrayView<int, Devices::Cuda> blocksNeeded)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= cuda_tasks.getSize())
        return;

    auto task = cuda_tasks[i];
    int size = task.partitionEnd - task.partitionBegin;
    blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0);
}

template <typename Value, typename Function>
__global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
                             ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping,
                             ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                             ArrayView<Value, Devices::Cuda> src, const Function &Cmp)
{
    if (blockIdx.x >= cuda_tasks.getSize())
        return;

    int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1];
    int end = cuda_reductionTaskInitMem[blockIdx.x];
    for (int i = start + threadIdx.x; i < end; i += blockDim.x)
        cuda_blockToTaskMapping[i] = blockIdx.x;
    if (threadIdx.x == 0)
    {
        TASK &task = cuda_tasks[blockIdx.x];
        int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp);
        task.initTask(start, end - start, pivotIdx);
    }
}
//-----------------------------------------------------------
//-----------------------------------------------------------

template <typename Value>
class QUICKSORT
{
    ArrayView<Value, Devices::Cuda> arr;
    Array<Value, Devices::Cuda> aux;

    int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable;

    const int maxBitonicSize = threadsPerBlock * 2;
    const int desired_2ndPhasElemPerBlock = maxBitonicSize;
    const int g_maxTasks = 1 << 14;

    int maxTasks;
    Array<TASK, Devices::Cuda> cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks;
    Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //each is effectively a single integer
    int host_1stPhaseTasksAmount; //host copy of cuda_newTasksAmount
    int host_2ndPhaseTasksAmount; //host copy of cuda_2ndPhaseTasksAmount
    Array<int, Devices::Cuda> cuda_blockToTaskMapping;
    Array<int, Devices::Cuda> cuda_reductionTaskInitMem;
    int iteration = 0;
    //--------------------------------------------------------------------------------------
    //--------------------------------------------------------------------------------------
public:
    QUICKSORT(ArrayView<Value, Devices::Cuda> arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable)
        : arr(arr.getView()), aux(arr.getSize()),
          maxBlocks(gridDim), threadsPerBlock(blockDim),
          desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable),

          maxTasks(min(arr.getSize(), g_maxTasks)),
          cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks),
          cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1),

          cuda_blockToTaskMapping(maxBlocks),
          cuda_reductionTaskInitMem(maxTasks)
    {
        cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0));
        host_1stPhaseTasksAmount = 1;

        host_2ndPhaseTasksAmount = 0;
        cuda_2ndPhaseTasksAmount = 0;
        iteration = 0;

        TNL_CHECK_CUDA_DEVICE;
    }

    template <typename Function>
    void sort(const Function &Cmp);

    template <typename Function>
    void firstPhase(const Function &Cmp);

    template <typename Function>
    void secondPhase(const Function &Cmp);

    int getSetsNeeded(int elemPerBlock) const;
    int getElemPerBlock() const;

    /**
     * returns the amount of blocks needed
     * */
    template <typename Function>
    int initTasks(int elemPerBlock, const Function &Cmp);

    void processNewTasks();
};

//---------------------------------------------------------------------------------------------

template <typename Value>
template <typename Function>
void QUICKSORT<Value>::sort(const Function &Cmp)
{
    firstPhase(Cmp);

    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
    if (total2ndPhase > 0)
        secondPhase(Cmp);

    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    return;
}

//---------------------------------------------------------------------------------------------

template <typename Value>
template <typename Function>
void QUICKSORT<Value>::firstPhase(const Function &Cmp)
{
    while (host_1stPhaseTasksAmount > 0)
    {
        //either the 2nd-phase task list or the 1st-phase task list is already full; during writing, overflowing tasks were placed into the other array as a backup
        if (host_1stPhaseTasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks)
            break;
        //just in case the newly created tasks wouldn't fit
        if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
            break;

        int elemPerBlock = getElemPerBlock();
        int blocksCnt = initTasks(elemPerBlock, Cmp);
        TNL_CHECK_CUDA_DEVICE;

        if (blocksCnt >= maxBlocks) //too many blocks needed, switch to 2nd phase
            break;

        //-----------------------------------------------
        //do the partitioning
        auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
        int externMemByteSize = elemPerBlock * sizeof(Value);
        /**
         * check if we can partition using shared memory for coalesced reads and writes
         * 1st phase of partitioning:
         * a set of blocks works on each task
         *
         * using the atomicAdd intrinsic, each block reserves a chunk of the destination
         * into which it moves the elements smaller and bigger than the pivot
         * */
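        /*
         * Rough illustration of the chunk reservation (a sketch with hypothetical
         * names, not the actual cudaPartition code): each block first counts its
         * elements relative to the pivot, then reserves disjoint destination ranges, e.g.
         *
         *   int leftOffset  = atomicAdd(&myTask.dstBegin, smallerCnt);
         *   int rightOffset = atomicSub(&myTask.dstEnd, biggerCnt) - biggerCnt;
         *
         * so writes from different blocks of the same task never overlap.
         */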
        if (externMemByteSize <= maxSharable)
        {
            cudaQuickSort1stPhase<Value, Function, true>
                <<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
                    arr, aux, Cmp, elemPerBlock,
                    task, cuda_blockToTaskMapping);
        }
        else
        {
            cudaQuickSort1stPhase<Value, Function, false>
                <<<blocksCnt, threadsPerBlock, 0>>>(
                    arr, aux, Cmp, elemPerBlock,
                    task, cuda_blockToTaskMapping);
        }
        TNL_CHECK_CUDA_DEVICE;
        /**
         * fill the gap between the smaller and bigger elements with elements == pivot
         * after writing, also create the new tasks; each task generates at most 2 new tasks
         *
         * tasks smaller than desired_2ndPhasElemPerBlock go into the 2nd phase
         * bigger ones need more blocks to partition and are written into newTask
         * with iteration % 2, rotate between the 2 task arrays to avoid copying
         * */
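        /*
         * Example with illustrative numbers: a task covering [0, 1000000) whose pivot
         * region ends up as [400000, 600000) spawns a left child [0, 400000) and a
         * right child [600000, 1000000); a child goes into the 2nd-phase list when it
         * has at most desired_2ndPhasElemPerBlock elements, otherwise into newTask.
         */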
        auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks;
        cudaWritePivot<Value>
            <<<host_1stPhaseTasksAmount, 1024>>>(
                arr, aux, desired_2ndPhasElemPerBlock,
                task, newTask, cuda_newTasksAmount.getData(),
                cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());
        TNL_CHECK_CUDA_DEVICE;

        processNewTasks();
        iteration++;
    }
}

//----------------------------------------------------------------------

template <typename Value>
template <typename Function>
void QUICKSORT<Value>::secondPhase(const Function &Cmp)
{
    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
    const int stackSize = 32;
    auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;

    int elemInShared = desiredElemPerBlock;
    int externSharedByteSize = sizeof(Value) * elemInShared;
    if (externSharedByteSize > maxSharable)
    {
        externSharedByteSize = 0;
        elemInShared = 0;
    }

    if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0)
    {
        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared);
    }
    else if (host_1stPhaseTasksAmount > 0)
    {
        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared);
    }
    else
    {
        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared);
    }
}

//----------------------------------------------------------------------

template <typename Value>
int QUICKSORT<Value>::getSetsNeeded(int elemPerBlock) const
{
    auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
    auto fetch = [=] __cuda_callable__(int i) {
        const auto &task = view[i];
        int size = task.partitionEnd - task.partitionBegin;
        return size / elemPerBlock + (size % elemPerBlock != 0);
    };
    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
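    // e.g. (illustrative numbers) tasks of sizes {1000, 5000} with elemPerBlock = 4096 need 1 + 2 = 3 blocks in total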
    return Algorithms::Reduction<Devices::Cuda>::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0);
}

template <typename Value>
int QUICKSORT<Value>::getElemPerBlock() const
{
    int setsNeeded = getSetsNeeded(desiredElemPerBlock);

    if (setsNeeded <= maxBlocks)
        return desiredElemPerBlock;

    //want to round multiplier * desiredElemPerBlock up to a multiple of threadsPerBlock,
    //i.e. find the smallest x so that multiplier * desiredElemPerBlock <= x * threadsPerBlock
    double multiplier = 1. * setsNeeded / maxBlocks;
    int elemPerBlock = multiplier * desiredElemPerBlock;
    setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0);
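    // e.g. (illustrative numbers) desiredElemPerBlock = 4096 and setsNeeded = 3 * maxBlocks
    // give multiplier = 3 and elemPerBlock = 12288; with threadsPerBlock = 512 this is
    // already a multiple, so 24 * 512 = 12288 elements per block are returned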

    return setsNeeded * threadsPerBlock;
}

template <typename Value>
template <typename Function>
int QUICKSORT<Value>::initTasks(int elemPerBlock, const Function &Cmp)
{
    int threads = min(host_1stPhaseTasksAmount, threadsPerBlock);
    int blocks = host_1stPhaseTasksAmount / threads + (host_1stPhaseTasksAmount % threads != 0);

    auto src = iteration % 2 == 0 ? arr : aux.getView();
    auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;

    //[i] == how many blocks task i needs
    cudaCalcBlocksNeeded<<<threads, blocks>>>(tasks.getView(0, host_1stPhaseTasksAmount),
                                              elemPerBlock, cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));

    thrust::inclusive_scan(thrust::device,
                           cuda_reductionTaskInitMem.getData(),
                           cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount,
                           cuda_reductionTaskInitMem.getData());
    int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);
    //need too many blocks, give back control
    if (blocksNeeded >= cuda_blockToTaskMapping.getSize())
        return blocksNeeded;

    cudaInitTask<<<host_1stPhaseTasksAmount, 512>>>(
        tasks.getView(0, host_1stPhaseTasksAmount),
        cuda_blockToTaskMapping.getView(0, blocksNeeded),
        cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount),
        src, Cmp);

    cuda_newTasksAmount.setElement(0, 0);
    return blocksNeeded;
}

template <typename Value>
void QUICKSORT<Value>::processNewTasks()
{
    host_1stPhaseTasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks);
    host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks);
}

//-----------------------------------------------------------
//-----------------------------------------------------------
//-----------------------------------------------------------

template <typename Value, typename Function>
void quicksort(ArrayView<Value, Devices::Cuda> arr, const Function &Cmp)
{
    const int maxBlocks = (1 << 20);

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int sharedReserve = sizeof(Value) + sizeof(int) * 16; //1pivot + 16 other shared vars reserved
    int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve;

    //blockDim*multiplier*sizeof(Value) <= maxSharable

    int blockDim = 512; //best case
    int elemPerBlock = maxSharable / sizeof(Value);
    const int maxMultiplier = 8;
    int multiplier = min(elemPerBlock / blockDim, maxMultiplier);
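    // e.g. (illustrative, assuming ~48 KiB of sharable memory and 4-byte Values):
    // elemPerBlock ~ 12000, multiplier = min(12000 / 512, 8) = 8,
    // so each block would use 8 * 512 = 4096 elements (16 KiB) of shared memory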
    if (multiplier <= 0)
    {
        blockDim = 256;
        multiplier = min(elemPerBlock / blockDim, maxMultiplier);
        if (multiplier <= 0)
        {
            //worst case scenario: shared memory can't be utilized at all because of the sheer size of Value
            //the sort has to be done using global memory alone

            QUICKSORT<Value> sorter(arr, maxBlocks, 512, 0, maxSharable);
            sorter.sort(Cmp);
            return;
        }
    }

    assert(blockDim * multiplier * sizeof(Value) <= maxSharable);

    QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
    sorter.sort(Cmp);
}

template <typename Value>
void quicksort(ArrayView<Value, Devices::Cuda> arr)
{
    quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
}
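
//-----------------------------------------------------------

/**
 * Usage sketch (illustrative only; assumes the surrounding TNL headers are available):
 *
 *   TNL::Containers::Array<int, TNL::Devices::Cuda> data(1 << 20);
 *   // ... fill data ...
 *   quicksort(data.getView());   // ascending order via operator<
 *   quicksort(data.getView(),
 *             [] __cuda_callable__(const int &a, const int &b) { return a > b; });   // custom comparator, descending
 * */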