#include <thrust/scan.h>
#include <thrust/execution_policy.h>

#include <iostream>

#include <TNL/Containers/Array.h>
#include <TNL/Containers/ArrayView.h>

#define deb(x) std::cout << #x << " = " << x << std::endl;

using namespace TNL;
using namespace TNL::Containers;
//-----------------------------------------------------------
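//writeNewTask: enqueue one child partition produced by a finished partitioning step.
//Partitions with at most maxElemFor2ndPhase elements go into the 2nd phase list (sorted later by a single block),
//larger ones go back into the 1st phase list; if the preferred list is full, the other one serves as a backup.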
__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
                             ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
    int size = end - begin;
    if (size < 0)
    {
        printf("negative size, something went really wrong\n");
        return;
    }
    if (size == 0)
        return;
    if (size <= maxElemFor2ndPhase)
    {
        int idx = atomicAdd(secondPhaseTasksCnt, 1);
        if (idx < secondPhaseTasks.getSize())
            secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(newTasksCnt, 1);
            if (idx < newTasks.getSize())
                newTasks[idx] = TASK(begin, end, depth + 1);
            else
                printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n");
        }
    }
    else
    {
        int idx = atomicAdd(newTasksCnt, 1);
        if (idx < newTasks.getSize())
            newTasks[idx] = TASK(begin, end, depth + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(secondPhaseTasksCnt, 1);
            if (idx < secondPhaseTasks.getSize())
                secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
            else
                printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n");
        }
    }
}
//----------------------------------------------------
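//cudaQuickSort1stPhase: a set of blocks cooperates on one task; taskMapping tells each block which task
//it belongs to. Depending on the task depth, data is read from arr and partitioned into aux or vice versa.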
template <typename Value, typename Function, bool useShared>
__global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp, int elemPerBlock,
                                      ArrayView<TASK, Devices::Cuda> tasks,
                                      ArrayView<int, Devices::Cuda> taskMapping)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;

    TASK &myTask = tasks[taskMapping[blockIdx.x]];
    auto &src = (myTask.depth & 1) == 0 ? arr : aux;
    auto &dst = (myTask.depth & 1) == 0 ? aux : arr;

    Value pivot = src[myTask.pivotIdx];

    cudaPartition<Value, Function, useShared>(
        src.getView(myTask.partitionBegin, myTask.partitionEnd),
        dst.getView(myTask.partitionBegin, myTask.partitionEnd),
        Cmp, sharedMem, pivot,
        elemPerBlock, myTask);
}
//----------------------------------------------------
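//cudaWritePivot: one block per task. Fills the gap between the "smaller than pivot" and "bigger than pivot"
//parts with the pivot value (those elements are already in their final place) and spawns up to two child tasks.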
template <typename Value>
__global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, int maxElemFor2ndPhase,
                               ArrayView<TASK, Devices::Cuda> tasks, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                               ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
    TASK &myTask = tasks[blockIdx.x];

    Value pivot = (myTask.depth & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];

    int leftBegin = myTask.partitionBegin, leftEnd = myTask.partitionBegin + myTask.dstBegin;
    int rightBegin = myTask.partitionBegin + myTask.dstEnd, rightEnd = myTask.partitionEnd;

    for (int i = leftEnd + threadIdx.x; i < rightBegin; i += blockDim.x)
    {
        /*
        #ifdef DEBUG
        aux[i] = -1;
        #endif
        */
        arr[i] = pivot;
    }

    if (threadIdx.x != 0) //only one thread per block registers the child tasks
        return;

    writeNewTask(leftBegin, leftEnd, myTask.depth,
                 maxElemFor2ndPhase,
                 newTasks, newTasksCnt,
                 secondPhaseTasks, secondPhaseTasksCnt);

    writeNewTask(rightBegin, rightEnd,
                 myTask.depth, maxElemFor2ndPhase,
                 newTasks, newTasksCnt,
                 secondPhaseTasks, secondPhaseTasksCnt);
}
//-----------------------------------------------------------
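//2nd phase kernels: each block sorts one independent task with a single-block quicksort, optionally staging
//data in shared memory (elemInShared > 0). The second overload takes two task lists: leftover 1st phase tasks
//followed by the accumulated 2nd phase tasks.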
template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks,
                                      int elemInShared)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;

    TASK &myTask = secondPhaseTasks[blockIdx.x];
    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
        return;

    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);

    if (elemInShared == 0)
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    else
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
}
template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
                                      const Function &Cmp,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks1,
                                      ArrayView<TASK, Devices::Cuda> secondPhaseTasks2,
                                      int elemInShared)
{
    extern __shared__ int externMem[];
    Value *sharedMem = (Value *)externMem;

    TASK myTask;
    if (blockIdx.x < secondPhaseTasks1.getSize())
        myTask = secondPhaseTasks1[blockIdx.x];
    else
        myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()];

    if (myTask.partitionEnd - myTask.partitionBegin <= 0)
        return;

    auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
    auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);

    if (elemInShared == 0)
        singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
    else
        singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
}
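//----------------------------------------------------
//task initialization helpers: cudaCalcBlocksNeeded computes how many blocks each 1st phase task needs,
//an inclusive scan over that array gives each task its range of blocks, and cudaInitTask fills the
//block -> task mapping for that range and picks a pivot for the task.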
__global__ void cudaCalcBlocksNeeded(ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
                                     ArrayView<int, Devices::Cuda> blocksNeeded)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= cuda_tasks.getSize())
        return;

    auto task = cuda_tasks[i];
    int size = task.partitionEnd - task.partitionBegin;
    blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0);
}
template <typename Value, typename Function>
__global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
                             ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping,
                             ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                             ArrayView<Value, Devices::Cuda> src, const Function &Cmp)
{
    if (blockIdx.x >= cuda_tasks.getSize())
        return;

    //this task owns the blocks [start, end) in the block -> task mapping
    int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1];
    int end = cuda_reductionTaskInitMem[blockIdx.x];
    for (int i = start + threadIdx.x; i < end; i += blockDim.x)
        cuda_blockToTaskMapping[i] = blockIdx.x;

    if (threadIdx.x == 0)
    {
        TASK &task = cuda_tasks[blockIdx.x];
        int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp);
        task.initTask(start, end - start, pivotIdx);
    }
}
//-----------------------------------------------------------
//-----------------------------------------------------------
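//Host-side driver of the sort. arr is sorted in place, aux is a scratch buffer of the same size used for
//double buffering (the task depth parity decides which one is the source). Tasks rotate between cuda_tasks
//and cuda_newTasks every iteration; tasks small enough for a single block accumulate in cuda_2ndPhaseTasks.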
template <typename Value>
class QUICKSORT
{
public:
    ArrayView<Value, Devices::Cuda> arr;
    Array<Value, Devices::Cuda> aux;

    int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable;
    const int maxBitonicSize = threadsPerBlock * 2;
    const int desired_2ndPhasElemPerBlock = maxBitonicSize;
    const int g_maxTasks = 1 << 14;
    int maxTasks;
    int iteration = 0; //parity selects the source of arr/aux and cuda_tasks/cuda_newTasks

    Array<TASK, Devices::Cuda> cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks;
    Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //each is in reality 1 integer
    int host_1stPhaseTasksAmount; //host copy of cuda_newTasksAmount
    int host_2ndPhaseTasksAmount; //host copy of cuda_2ndPhaseTasksAmount

    Array<int, Devices::Cuda> cuda_blockToTaskMapping; //block index -> index of the task it works on
    Array<int, Devices::Cuda> cuda_reductionTaskInitMem;
//--------------------------------------------------------------------------------------
//--------------------------------------------------------------------------------------
    QUICKSORT(ArrayView<Value, Devices::Cuda> arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable)
        : arr(arr.getView()), aux(arr.getSize()),
          maxBlocks(gridDim), threadsPerBlock(blockDim),
          desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable),
          maxTasks(min(arr.getSize(), g_maxTasks)),
          cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks),
          cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1),
          cuda_blockToTaskMapping(maxBlocks),
          cuda_reductionTaskInitMem(maxTasks)
    {
        cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0));
        cuda_newTasksAmount.setElement(0, 0);
        cuda_2ndPhaseTasksAmount.setElement(0, 0);
        host_1stPhaseTasksAmount = 1;
        host_2ndPhaseTasksAmount = 0;
    }
    template <typename Function>
    void sort(const Function &Cmp);

    template <typename Function>
    void firstPhase(const Function &Cmp);

    template <typename Function>
    void secondPhase(const Function &Cmp);

    int getSetsNeeded(int elemPerBlock) const;
    int getElemPerBlock() const;

    /**
     * returns the amount of blocks needed
     * */
    template <typename Function>
    int initTasks(int elemPerBlock, const Function &Cmp);

    void processNewTasks();
};
//---------------------------------------------------------------------------------------------
template <typename Value>
template <typename Function>
void QUICKSORT<Value>::sort(const Function &Cmp)
{
firstPhase(Cmp);
int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
if (total2ndPhase > 0)
secondPhase(Cmp);
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
return;
}
//---------------------------------------------------------------------------------------------
template <typename Value>
template <typename Function>
void QUICKSORT<Value>::firstPhase(const Function &Cmp)
{
    while (host_1stPhaseTasksAmount > 0)
    {
        //the 2nd phase task list or the 1st phase task list is full; as a backup during writing,
        //overflowing tasks were already written into the other list, so stop and let the 2nd phase finish the rest
        if (host_1stPhaseTasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks)
            break;

        //just in case newly created tasks wouldnt fit: each task can spawn up to 2 new ones
        if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
            break;

        int elemPerBlock = getElemPerBlock();
        int blocksCnt = initTasks(elemPerBlock, Cmp);
        TNL_CHECK_CUDA_DEVICE;

        if (blocksCnt >= maxBlocks) //too many blocks needed, switch to 2nd phase
            break;

        //-----------------------------------------------
        //do the partitioning

        auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
        int externMemByteSize = elemPerBlock * sizeof(Value);

        /**
         * check if we can partition using shared memory for coalesced read and write
         * 1st phase of partitioning
         * sets of blocks work on a task
         *
         * using the atomicAdd intrinsic, each block reserves a chunk of memory where to move elements
         * smaller and bigger than the pivot
         * */
        if (externMemByteSize <= maxSharable)
        {
            cudaQuickSort1stPhase<Value, Function, true>
                <<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
                    arr, aux, Cmp, elemPerBlock,
                    task, cuda_blockToTaskMapping);
        }
        else
        {
            cudaQuickSort1stPhase<Value, Function, false>
                <<<blocksCnt, threadsPerBlock, 0>>>(
                    arr, aux, Cmp, elemPerBlock,
                    task, cuda_blockToTaskMapping);
        }
        TNL_CHECK_CUDA_DEVICE;

        /**
         * fill in the gap between smaller and bigger with elements == pivot
         * after writing also create new tasks, each task generates at most 2 tasks
         *
         * tasks smaller than desired_2ndPhasElemPerBlock go into the 2nd phase
         * bigger ones need more blocks to partition and are written into newTask
         * with iteration % 2, rotate between the 2 task arrays to avoid copying
         * */
        auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks;
        cudaWritePivot<Value>
            <<<host_1stPhaseTasksAmount, 1024>>>(
                arr, aux, desired_2ndPhasElemPerBlock,
                task, newTask, cuda_newTasksAmount.getData(),
                cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());
        TNL_CHECK_CUDA_DEVICE;

        processNewTasks();
        iteration++;
    }
}
//----------------------------------------------------------------------
template <typename Value>
template <typename Function>
void QUICKSORT<Value>::secondPhase(const Function &Cmp)
{
int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
const int stackSize = 32;
auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
int elemInShared = desiredElemPerBlock;
int externSharedByteSize = sizeof(Value) * elemInShared;
if (externSharedByteSize > maxSharable)
{
externSharedByteSize = 0;
elemInShared = 0;
}
if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0)
{
auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
cudaQuickSort2ndPhase<Value, Function, stackSize>
<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared);
}
else if (host_1stPhaseTasksAmount > 0)
{
auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
cudaQuickSort2ndPhase<Value, Function, stackSize>
<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared);
}
else
{
auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
cudaQuickSort2ndPhase<Value, Function, stackSize>
<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared);
}
}
//----------------------------------------------------------------------
template <typename Value>
int QUICKSORT<Value>::getSetsNeeded(int elemPerBlock) const
{
auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
auto fetch = [=] __cuda_callable__(int i) {
const auto &task = view[i];
const int size = task.partitionEnd - task.partitionBegin;
return size / elemPerBlock + (size % elemPerBlock != 0);
};
auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
return Algorithms::Reduction<Devices::Cuda>::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0);
}
template <typename Value>
int QUICKSORT<Value>::getElemPerBlock() const
{
    int setsNeeded = getSetsNeeded(desiredElemPerBlock);

    if (setsNeeded <= maxBlocks)
        return desiredElemPerBlock;

    //want multiplier*minElemPerBLock <= x*threadPerBlock
    //find smallest x so that this inequality holds
    double multiplier = 1. * setsNeeded / maxBlocks;
    int elemPerBlock = multiplier * desiredElemPerBlock;
    setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0);

    return setsNeeded * threadsPerBlock;
}
template <typename Value>
template <typename Function>
int QUICKSORT<Value>::initTasks(int elemPerBlock, const Function &Cmp)
{
int threads = min(host_1stPhaseTasksAmount, threadsPerBlock);
int blocks = host_1stPhaseTasksAmount / threads + (host_1stPhaseTasksAmount % threads != 0);
auto src = iteration % 2 == 0 ? arr : aux.getView();
auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
//[i] == how many blocks task i needs
cudaCalcBlocksNeeded<<<threads, blocks>>>(tasks.getView(0, host_1stPhaseTasksAmount),
elemPerBlock, cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));
thrust::inclusive_scan(thrust::device,
cuda_reductionTaskInitMem.getData(),
cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount,
cuda_reductionTaskInitMem.getData());
int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);
if (blocksNeeded >= cuda_blockToTaskMapping.getSize())
cuda_blockToTaskMapping.setSize(blocksNeeded); //grow the block -> task mapping if this round needs more blocks
cudaInitTask<<<host_1stPhaseTasksAmount, 512>>>(
tasks.getView(0, host_1stPhaseTasksAmount),
cuda_blockToTaskMapping.getView(0, blocksNeeded),
cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount),
src, Cmp);

return blocksNeeded;
}
template <typename Value>
void QUICKSORT<Value>::processNewTasks()
{
    host_1stPhaseTasksAmount = min(cuda_newTasksAmount.getElement(0), maxTasks);
    host_2ndPhaseTasksAmount = min(cuda_2ndPhaseTasksAmount.getElement(0), maxTasks);
    cuda_newTasksAmount.setElement(0, 0); //reset before the next partitioning round; 2nd phase tasks keep accumulating
}
//-----------------------------------------------------------
//-----------------------------------------------------------
//-----------------------------------------------------------
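//Entry point: queries the device's shared memory per block and picks the block size and elements per block
//so that one block's chunk fits into shared memory; if even a small multiple of blockDim values does not fit,
//it falls back to a purely global-memory sort (desiredElemPerBlock == 0).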
template <typename Value, typename Function>
void quicksort(ArrayView<Value, Devices::Cuda> arr, const Function &Cmp)
{
const int maxBlocks = (1 << 20);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
int sharedReserve = sizeof(Value) + sizeof(int) * 16; //1pivot + 16 other shared vars reserved
int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve;
//blockDim*multiplier*sizeof(Value) <= maxSharable
int blockDim = 512; //best case
int elemPerBlock = maxSharable / sizeof(Value);
const int maxMultiplier = 8;
int multiplier = min(elemPerBlock / blockDim, maxMultiplier);
if (multiplier <= 0)
{
blockDim = 256;
multiplier = min(elemPerBlock / blockDim, maxMultiplier);
if (multiplier <= 0)
{
//worst case scenario, shared memory can't be utilized at all because of the sheer size of Value
//sort has to be done using global memory alone
QUICKSORT<Value> sorter(arr, maxBlocks, 512, 0, maxSharable);
sorter.sort(Cmp);
return;
}
}
assert(blockDim * multiplier * sizeof(Value) <= maxSharable);
QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
sorter.sort(Cmp);
}
template <typename Value>
void quicksort(ArrayView<Value, Devices::Cuda> arr)
{
    quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
}
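//-----------------------------------------------------------
//Usage sketch (illustration only, not part of the sorter): the array name `data` and the descending
//comparator are made up for the example; only Array::getSize/setElement/getView from TNL are assumed.
//
//  Array<int, Devices::Cuda> data(1024);
//  for (int i = 0; i < data.getSize(); i++)
//      data.setElement(i, data.getSize() - i);   //fill with a decreasing sequence
//  quicksort(data.getView());                    //ascending sort with the default comparator
//  quicksort(data.getView(), [] __cuda_callable__(const int &a, const int &b) { return b < a; }); //descending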