#pragma once

#include <TNL/Containers/Array.h>
#include "../util/reduction.cuh"
#include "task.h"
#include "cudaPartition.cuh"
#include "quicksort_1Block.cuh"
#include "../bitonicSort/bitonicSort.h"

#include <iostream>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>

#define deb(x) std::cout << #x << " = " << x << std::endl;

using namespace TNL;
using namespace TNL::Containers;

//-----------------------------------------------------------

__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
                             ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
    int size = end - begin;
    if (size < 0)
    {
        printf("negative size, something went really wrong\n");
        return;
    }

    if (size == 0)
        return;

    if (size <= maxElemFor2ndPhase)
    {
        int idx = atomicAdd(secondPhaseTasksCnt, 1);
        if (idx < secondPhaseTasks.getSize())
            secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(newTasksCnt, 1);
            if (idx < newTasks.getSize())
                newTasks[idx] = TASK(begin, end, depth + 1);
            else
                printf("ran out of memory for second phase task, there isn't even space in newTask list\nPart of array may stay unsorted!!!\n");
        }
    }
    else
    {
        int idx = atomicAdd(newTasksCnt, 1);
        if (idx < newTasks.getSize())
            newTasks[idx] = TASK(begin, end, depth + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(secondPhaseTasksCnt, 1);
            if (idx < secondPhaseTasks.getSize())
                secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
            else
                printf("ran out of memory for newTask, there isn't even space in second phase task list\nPart of array may stay unsorted!!!\n");
        }
    }
}

//----------------------------------------------------

template <typename Value, typename Function, bool useShared>
__global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp, int elemPerBlock,
                                      ArrayView<TASK, Devices::Cuda> tasks,
                                      ArrayView<int, Devices::Cuda> taskMapping)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;

    TASK &myTask = tasks[taskMapping[blockIdx.x]];
    auto &src = (myTask.depth & 1) == 0 ? arr : aux;
    auto &dst = (myTask.depth & 1) == 0 ? aux : arr;

    Value pivot = src[myTask.pivotIdx];

    cudaPartition<Value, Function, useShared>(
        src.getView(myTask.partitionBegin, myTask.partitionEnd),
        dst.getView(myTask.partitionBegin, myTask.partitionEnd),
        Cmp, sharedMem, pivot, elemPerBlock, myTask);
}

//----------------------------------------------------
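/**
 * Illustration of what cudaWritePivot below does (not executed; a sketch of the
 * assumed semantics of TASK::dstBegin/dstEnd as set by cudaPartition):
 * after partitioning [3, 8, 1, 5, 9, 5, 2] around pivot 5, three elements are
 * smaller and two are bigger, so dstBegin = 3 and dstEnd = 7 - 2 = 5; the gap
 * [3, 5) of the partition is then overwritten with the pivot value 5. These
 * elements are already in their final sorted position, which is presumably why
 * they are written straight into arr regardless of the task's depth parity.
 */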
template <typename Value>
__global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, int maxElemFor2ndPhase,
                               ArrayView<TASK, Devices::Cuda> tasks, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                               ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
    TASK &myTask = tasks[blockIdx.x];
    Value pivot = (myTask.depth & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];

    int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin;
    int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd;

    for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x)
    {
        /*
        #ifdef DEBUG
        aux[i] = -1;
        #endif
        */
        arr[i] = pivot;
    }

    if (threadIdx.x != 0)
        return;

    if (leftEnd - leftBegin > 0)
    {
        writeNewTask(leftBegin, leftEnd, myTask.depth, maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
    }

    if (rightEnd - rightBegin > 0)
    {
        writeNewTask(rightBegin, rightEnd, myTask.depth, maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
    }
}

//-----------------------------------------------------------

template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int elemInShared)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;

    TASK &myTask = secondPhaseTasks[blockIdx.x];
    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
        return;

    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);

    if (elemInShared == 0)
    {
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    }
    else
    {
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    }
}

template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks1,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks2, int elemInShared)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;

    TASK myTask;
    if (blockIdx.x < secondPhaseTasks1.getSize())
        myTask = secondPhaseTasks1[blockIdx.x];
    else
        myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()];

    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
    {
        printf("empty task???\n");
        return;
    }

    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);

    if (elemInShared == 0)
    {
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    }
    else
    {
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    }
}

//-----------------------------------------------------------

__global__ void cudaCalcBlocksNeeded(ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
                                     ArrayView<int, Devices::Cuda> blocksNeeded)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= cuda_tasks.getSize())
        return;

    auto task = cuda_tasks[i];
    int size = task.partitionEnd - task.partitionBegin;
    blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0);
}
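/**
 * Worked example of the block-to-task mapping (illustrative only, not executed):
 * with elemPerBlock = 4 and three tasks of sizes {10, 3, 7}, cudaCalcBlocksNeeded
 * above writes {3, 1, 2}; the inclusive scan in initTasks() turns this into
 * {3, 4, 6}, and cudaInitTask below fills cuda_blockToTaskMapping with
 * {0, 0, 0, 1, 2, 2}, so the first phase launches 6 partitioning blocks.
 */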
template <typename Value, typename Function>
__global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
                             ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping,
                             ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                             ArrayView<Value, Devices::Cuda> src, const Function &Cmp)
{
    if (blockIdx.x >= cuda_tasks.getSize())
        return;

    int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1];
    int end = cuda_reductionTaskInitMem[blockIdx.x];
    for (int i = start + threadIdx.x; i < end; i += blockDim.x)
        cuda_blockToTaskMapping[i] = blockIdx.x;

    if (threadIdx.x == 0)
    {
        TASK &task = cuda_tasks[blockIdx.x];
        int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp);
        task.initTask(start, end - start, pivotIdx);
    }
}

//-----------------------------------------------------------
//-----------------------------------------------------------

template <typename Value>
class QUICKSORT
{
    ArrayView<Value, Devices::Cuda> arr;
    Array<Value, Devices::Cuda> aux;

    int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable;

    const int maxBitonicSize = threadsPerBlock * 2;
    const int desired_2ndPhasElemPerBlock = maxBitonicSize;
    const int g_maxTasks = 1 << 14;
    int maxTasks;

    Array<TASK, Devices::Cuda> cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks;

    Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //each array in reality holds a single integer
    int host_1stPhaseTasksAmount;                                            //host-side counter mirroring cuda_newTasksAmount
    int host_2ndPhaseTasksAmount;                                            //host-side counter mirroring cuda_2ndPhaseTasksAmount

    Array<int, Devices::Cuda> cuda_blockToTaskMapping;
    Array<int, Devices::Cuda> cuda_reductionTaskInitMem;

    int iteration = 0;

    //--------------------------------------------------------------------------------------
    //--------------------------------------------------------------------------------------

public:
    QUICKSORT(ArrayView<Value, Devices::Cuda> arr,
              int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable)
        : arr(arr.getView()), aux(arr.getSize()),
          maxBlocks(gridDim), threadsPerBlock(blockDim),
          desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable),
          maxTasks(min(arr.getSize(), g_maxTasks)),
          cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks),
          cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1),
          cuda_blockToTaskMapping(maxBlocks),
          cuda_reductionTaskInitMem(maxTasks)
    {
        cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0));
        host_1stPhaseTasksAmount = 1;

        host_2ndPhaseTasksAmount = 0;
        cuda_2ndPhaseTasksAmount = 0;

        iteration = 0;
        TNL_CHECK_CUDA_DEVICE;
    }

    template <typename Function>
    void sort(const Function &Cmp);

    template <typename Function>
    void firstPhase(const Function &Cmp);

    template <typename Function>
    void secondPhase(const Function &Cmp);

    int getSetsNeeded(int elemPerBlock) const;
    int getElemPerBlock() const;

    /**
     * Returns the number of blocks needed.
     */
    template <typename Function>
    int initTasks(int elemPerBlock, const Function &Cmp);

    void processNewTasks();
};

//---------------------------------------------------------------------------------------------

template <typename Value>
template <typename Function>
void QUICKSORT<Value>::sort(const Function &Cmp)
{
    firstPhase(Cmp);

    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
    if (total2ndPhase > 0)
        secondPhase(Cmp);

    cudaDeviceSynchronize();
    TNL_CHECK_CUDA_DEVICE;
    return;
}

//---------------------------------------------------------------------------------------------
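/**
 * The first phase below keeps partitioning as long as the task buffers cannot
 * overflow. Since every task spawns at most two children, it bails out to the
 * second phase once 2 * host_1stPhaseTasksAmount could exceed the free capacity
 * of both task arrays. Numeric illustration (assuming maxTasks = 1 << 14 = 16384
 * and host_2ndPhaseTasksAmount = 10000): the loop stops once
 * host_1stPhaseTasksAmount * 2 >= 16384 + 6384 = 22768, i.e. at 11384 pending tasks.
 */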
template <typename Value>
template <typename Function>
void QUICKSORT<Value>::firstPhase(const Function &Cmp)
{
    while (host_1stPhaseTasksAmount > 0)
    {
        //either the 2nd phase or the 1st phase task buffer is already full;
        //as a backup during writing, overflowing tasks were written into the other array
        if (host_1stPhaseTasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks)
            break;

        //just in case newly created tasks wouldn't fit
        if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
            break;

        int elemPerBlock = getElemPerBlock();
        int blocksCnt = initTasks(elemPerBlock, Cmp);
        TNL_CHECK_CUDA_DEVICE;

        if (blocksCnt >= maxBlocks) //too many blocks needed, switch to 2nd phase
            break;

        //-----------------------------------------------
        //do the partitioning

        auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
        int externMemByteSize = elemPerBlock * sizeof(Value);

        /**
         * check if we can partition using shared memory for coalesced reads and writes
         * 1st phase of partitioning
         * sets of blocks work on a task
         *
         * using the atomicAdd intrinsic, each block reserves a chunk of memory
         * into which it moves the elements smaller and bigger than the pivot
         */
        if (externMemByteSize <= maxSharable)
        {
            cudaQuickSort1stPhase<Value, Function, true>
                <<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
                    arr, aux, Cmp, elemPerBlock,
                    task, cuda_blockToTaskMapping);
        }
        else
        {
            cudaQuickSort1stPhase<Value, Function, false>
                <<<blocksCnt, threadsPerBlock, 0>>>(
                    arr, aux, Cmp, elemPerBlock,
                    task, cuda_blockToTaskMapping);
        }
        TNL_CHECK_CUDA_DEVICE;

        /**
         * fill the gap between the smaller and bigger elements with elements == pivot
         * after writing, also create new tasks; each task generates at most 2 new tasks
         *
         * tasks smaller than desired_2ndPhasElemPerBlock go into the 2nd phase,
         * bigger ones need more blocks to partition and are written into newTask
         * with iteration % 2, rotate between the 2 task arrays to save on copying
         */
        auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks;
        cudaWritePivot<Value>
            <<<host_1stPhaseTasksAmount, 1024>>>(
                arr, aux, desired_2ndPhasElemPerBlock,
                task, newTask, cuda_newTasksAmount.getData(),
                cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());
        TNL_CHECK_CUDA_DEVICE;

        processNewTasks();
        iteration++;
    }
}

//----------------------------------------------------------------------

template <typename Value>
template <typename Function>
void QUICKSORT<Value>::secondPhase(const Function &Cmp)
{
    int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
    const int stackSize = 32;
    auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;

    int elemInShared = desiredElemPerBlock;
    int externSharedByteSize = sizeof(Value) * elemInShared;
    if (externSharedByteSize > maxSharable)
    {
        externSharedByteSize = 0;
        elemInShared = 0;
    }

    if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0)
    {
        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);

        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared);
    }
    else if (host_1stPhaseTasksAmount > 0)
    {
        auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);

        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared);
    }
    else
    {
        auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);

        cudaQuickSort2ndPhase<Value, Function, stackSize>
            <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared);
    }
}

//----------------------------------------------------------------------
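/**
 * Worked example for getElemPerBlock() below (illustrative only): with
 * desiredElemPerBlock = 4096, threadsPerBlock = 512 and getSetsNeeded()
 * reporting 3 * maxBlocks sets, the multiplier becomes 3 and
 * elemPerBlock = 12288, which is already a multiple of 512, so 12288 is
 * returned and each block processes three times more elements in that pass.
 */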
template <typename Value>
int QUICKSORT<Value>::getSetsNeeded(int elemPerBlock) const
{
    auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
    auto fetch = [=] __cuda_callable__(int i) {
        const auto &task = view[i];
        int size = task.partitionEnd - task.partitionBegin;
        return size / elemPerBlock + (size % elemPerBlock != 0);
    };
    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
    return Algorithms::Reduction<Devices::Cuda>::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0);
}

template <typename Value>
int QUICKSORT<Value>::getElemPerBlock() const
{
    int setsNeeded = getSetsNeeded(desiredElemPerBlock);

    if (setsNeeded <= maxBlocks)
        return desiredElemPerBlock;

    //want multiplier * desiredElemPerBlock <= x * threadsPerBlock
    //find the smallest x so that this inequality holds
    double multiplier = 1. * setsNeeded / maxBlocks;
    int elemPerBlock = multiplier * desiredElemPerBlock;
    setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0);

    return setsNeeded * threadsPerBlock;
}

template <typename Value>
template <typename Function>
int QUICKSORT<Value>::initTasks(int elemPerBlock, const Function &Cmp)
{
    int threads = min(host_1stPhaseTasksAmount, threadsPerBlock);
    int blocks = host_1stPhaseTasksAmount / threads + (host_1stPhaseTasksAmount % threads != 0);
    auto src = iteration % 2 == 0 ? arr : aux.getView();
    auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;

    //[i] == how many blocks task i needs
    cudaCalcBlocksNeeded<<<blocks, threads>>>(tasks.getView(0, host_1stPhaseTasksAmount), elemPerBlock,
                                              cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));

    thrust::inclusive_scan(thrust::device,
                           cuda_reductionTaskInitMem.getData(),
                           cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount,
                           cuda_reductionTaskInitMem.getData());

    int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);

    //need too many blocks, give back control
    if (blocksNeeded >= cuda_blockToTaskMapping.getSize())
        return blocksNeeded;

    cudaInitTask<<<host_1stPhaseTasksAmount, 512>>>(
        tasks.getView(0, host_1stPhaseTasksAmount),
        cuda_blockToTaskMapping.getView(0, blocksNeeded),
        cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount),
        src, Cmp);

    cuda_newTasksAmount.setElement(0, 0);
    return blocksNeeded;
}

template <typename Value>
void QUICKSORT<Value>::processNewTasks()
{
    host_1stPhaseTasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks);
    host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks);
}

//-----------------------------------------------------------
//-----------------------------------------------------------
//-----------------------------------------------------------
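/**
 * Worked example for the launch configuration chosen below (illustrative only,
 * assuming a device with 48 KiB of shared memory per block and Value = int):
 * sharedReserve = 4 + 16 * 4 = 68 B, maxSharable = 49152 - 68 = 49084 B,
 * elemPerBlock = 49084 / 4 = 12271, multiplier = min(12271 / 512, 8) = 8,
 * so the sorter is constructed with blockDim = 512 and desiredElemPerBlock = 4096.
 */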
template <typename Value, typename Function>
void quicksort(ArrayView<Value, Devices::Cuda> arr, const Function &Cmp)
{
    const int maxBlocks = (1 << 20);

    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int sharedReserve = sizeof(Value) + sizeof(int) * 16; //1 pivot + 16 other shared variables reserved
    int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve;

    //blockDim * multiplier * sizeof(Value) <= maxSharable
    int blockDim = 512; //best case
    int elemPerBlock = maxSharable / sizeof(Value);
    const int maxMultiplier = 8;
    int multiplier = min(elemPerBlock / blockDim, maxMultiplier);

    if (multiplier <= 0)
    {
        blockDim = 256;
        multiplier = min(elemPerBlock / blockDim, maxMultiplier);
        if (multiplier <= 0)
        {
            //worst case scenario: shared memory can't be utilized at all because of the sheer size of Value
            //the sort has to be done using global memory alone
            QUICKSORT<Value> sorter(arr, maxBlocks, 512, 0, maxSharable);
            sorter.sort(Cmp);
            return;
        }
    }

    assert(blockDim * multiplier * sizeof(Value) <= maxSharable);
    QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
    sorter.sort(Cmp);
}

template <typename Value>
void quicksort(ArrayView<Value, Devices::Cuda> arr)
{
    quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
}
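/**
 * Minimal usage sketch (illustrative only, not part of the library; the header
 * name in the include is assumed and the array is expected to be allocated and
 * filled on the device beforehand):
 *
 *   #include <TNL/Containers/Array.h>
 *   #include "quicksort.cuh" // this header, name assumed
 *
 *   TNL::Containers::Array<int, TNL::Devices::Cuda> data(1 << 20);
 *   // ... fill data ...
 *
 *   // sort ascending with the default comparator
 *   quicksort(data.getView());
 *
 *   // or sort with a custom comparator, e.g. descending
 *   quicksort(data.getView(),
 *             [] __cuda_callable__(const int &a, const int &b) { return b < a; });
 */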