Commit 070391be authored by Xuan Thang Nguyen

calc sharedMem for 1st phase

parent 7e72c87e
@@ -19,11 +19,11 @@ using namespace TNL::Containers;
//-----------------------------------------------------------
__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
int size = end - begin;
if (size < 0)
{
printf("negative size, something went really wrong\n");
return;
@@ -68,18 +68,18 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
template <typename Value, typename Function>
__global__ void cudaQuickSort1stPhase_1(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
const Function &Cmp, int elemPerBlock,
ArrayView<TASK, Devices::Cuda> tasks,
ArrayView<int, Devices::Cuda> taskMapping)
{
extern __shared__ int externMem[];
Value *sharedMem = (Value *)externMem;
static __shared__ Value pivot;
TASK &myTask = tasks[taskMapping[blockIdx.x]];
auto &src = (myTask.depth & 1) == 0 ? arr : aux;
auto &dst = (myTask.depth & 1) == 0 ? aux : arr;
if (threadIdx.x == 0)
pivot = src[myTask.pivotIdx];
@@ -94,15 +94,15 @@ __global__ void cudaQuickSort1stPhase_1(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
template <typename Value, typename Function>
__global__ void cudaQuickSort1stPhase_2(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
const Function &Cmp, int elemPerBlock,
ArrayView<TASK, Devices::Cuda> tasks,
ArrayView<int, Devices::Cuda> taskMapping)
{
static __shared__ Value pivot;
TASK &myTask = tasks[taskMapping[blockIdx.x]];
auto &src = (myTask.depth & 1) == 0 ? arr : aux;
auto &dst = (myTask.depth & 1) == 0 ? aux : arr;
if (threadIdx.x == 0)
pivot = src[myTask.pivotIdx];
@@ -116,7 +116,7 @@ __global__ void cudaQuickSort1stPhase_2(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
//----------------------------------------------------
template <typename Value>
__global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, int maxElemFor2ndPhase,
ArrayView<TASK, Devices::Cuda> tasks, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
@@ -149,20 +149,20 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, int maxElemFor2ndPhase,
if (threadIdx.x != 0)
return;
if (leftEnd - leftBegin > 0)
{
writeNewTask(leftBegin, leftEnd, myTask.depth,
maxElemFor2ndPhase,
newTasks, newTasksCnt,
secondPhaseTasks, secondPhaseTasksCnt);
}
if (rightEnd - rightBegin > 0)
{
writeNewTask(rightBegin, rightEnd,
myTask.depth, maxElemFor2ndPhase,
newTasks, newTasksCnt,
secondPhaseTasks, secondPhaseTasksCnt);
}
}
@@ -174,7 +174,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
ArrayView<TASK, Devices::Cuda> secondPhaseTasks)
{
TASK &myTask = secondPhaseTasks[blockIdx.x];
if (myTask.partitionEnd - myTask.partitionBegin <= 0)
return;
auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
@@ -183,7 +183,6 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
singleBlockQuickSort<Value, Function, stackSize>(arrView, auxView, Cmp, myTask.depth);
}
template <typename Value, typename Function, int stackSize>
__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
const Function &Cmp,
@@ -191,12 +190,12 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
ArrayView<TASK, Devices::Cuda> secondPhaseTasks2)
{
TASK myTask;
if (blockIdx.x < secondPhaseTasks1.getSize())
myTask = secondPhaseTasks1[blockIdx.x];
else
myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()];
if (myTask.partitionEnd - myTask.partitionBegin <= 0)
return;
auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
@@ -208,16 +207,16 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
//-----------------------------------------------------------
__global__ void cudaCalcBlocksNeeded(ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
ArrayView<int, Devices::Cuda> blocksNeeded)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= cuda_tasks.getSize())
return;
auto task = cuda_tasks[i];
int size = task.partitionEnd - task.partitionBegin;
blocksNeeded[i] = size / elemPerBlock + (size % elemPerBlock != 0);
}
template <typename Value, typename Function>
__global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
@@ -225,40 +224,42 @@ __global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
ArrayView<Value, Devices::Cuda> src, const Function &Cmp)
{
if (blockIdx.x >= cuda_tasks.getSize())
return;
int start = blockIdx.x == 0 ? 0 : cuda_reductionTaskInitMem[blockIdx.x - 1];
int end = cuda_reductionTaskInitMem[blockIdx.x];
for (int i = start + threadIdx.x; i < end; i += blockDim.x)
cuda_blockToTaskMapping[i] = blockIdx.x;
if (threadIdx.x == 0)
{
TASK &task = cuda_tasks[blockIdx.x];
int pivotIdx = task.partitionBegin + pickPivotIdx(src.getView(task.partitionBegin, task.partitionEnd), Cmp);
task.initTask(start, end - start, pivotIdx);
}
}
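The two kernels above cooperate through an inclusive scan: cudaCalcBlocksNeeded writes how many blocks each task needs, and after the scan, task i owns the block range [scanned[i-1], scanned[i]). A minimal host-side sketch with assumed task sizes (plain C++, not part of this commit; the real code scans on the device with thrust::inclusive_scan):

#include <cstdio>

int main()
{
    //per-task block counts as cudaCalcBlocksNeeded would produce them (assumed values)
    const int taskCount = 3;
    int blocksNeeded[taskCount] = {2, 1, 3};

    //inclusive scan, as initTasks does with thrust::inclusive_scan
    int scanned[taskCount];
    int sum = 0;
    for (int i = 0; i < taskCount; i++)
        scanned[i] = (sum += blocksNeeded[i]); //{2, 3, 6}

    //cudaInitTask: task i owns the block range [scanned[i-1], scanned[i])
    for (int i = 0; i < taskCount; i++)
    {
        int start = i == 0 ? 0 : scanned[i - 1];
        int end = scanned[i];
        for (int b = start; b < end; b++)
            printf("cuda_blockToTaskMapping[%d] = %d\n", b, i);
    }
    return 0;
}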
//-----------------------------------------------------------
//-----------------------------------------------------------
template <typename Value>
class QUICKSORT
{
ArrayView<Value, Devices::Cuda> arr;
Array<Value, Devices::Cuda> aux;
int maxBlocks, threadsPerBlock, desiredElemPerBlock, maxSharable;
const int maxBitonicSize = threadsPerBlock * 2;
const int desired_2ndPhasElemPerBlock = maxBitonicSize;
const int g_maxTasks = 1 << 14;
int maxTasks;
Array<TASK, Devices::Cuda> cuda_tasks, cuda_newTasks, cuda_2ndPhaseTasks;
Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //in reality 1 integer each
int host_1stPhaseTasksAmount; //host-side counter mirroring cuda_newTasksAmount
int host_2ndPhaseTasksAmount; //host-side counter mirroring cuda_2ndPhaseTasksAmount
Array<int, Devices::Cuda> cuda_blockToTaskMapping;
@@ -266,32 +267,40 @@ class QUICKSORT
int iteration = 0;
//--------------------------------------------------------------------------------------
public:
QUICKSORT(ArrayView<Value, Devices::Cuda> arr, int gridDim, int blockDim, int desiredElemPerBlock, int maxSharable)
: arr(arr.getView()), aux(arr.getSize()),
maxBlocks(gridDim), threadsPerBlock(blockDim),
desiredElemPerBlock(desiredElemPerBlock), maxSharable(maxSharable),
maxTasks(min(arr.getSize(), g_maxTasks)),
cuda_tasks(maxTasks), cuda_newTasks(maxTasks), cuda_2ndPhaseTasks(maxTasks),
cuda_newTasksAmount(1), cuda_2ndPhaseTasksAmount(1),
cuda_blockToTaskMapping(maxBlocks),
cuda_reductionTaskInitMem(maxTasks)
{
cuda_tasks.setElement(0, TASK(0, arr.getSize(), 0));
host_1stPhaseTasksAmount = 1;
host_2ndPhaseTasksAmount = 0;
cuda_2ndPhaseTasksAmount = 0;
iteration = 0;
TNL_CHECK_CUDA_DEVICE;
}
template <typename Function>
void sort(const Function &Cmp);
template <typename Function>
void firstPhase(const Function &Cmp);
template <typename Function>
void secondPhase(const Function &Cmp);
int getSetsNeeded(int elemPerBlock) const;
int getElemPerBlock() const;
@@ -299,41 +308,66 @@ public:
* returns the amount of blocks needed
* */
template <typename Function>
int initTasks(int elemPerBlock, const Function &Cmp);
void processNewTasks();
};
//---------------------------------------------------------------------------------------------
template <typename Value>
template <typename Function>
void QUICKSORT<Value>::sort(const Function &Cmp)
{
firstPhase(Cmp);
int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
if (total2ndPhase > 0)
secondPhase(Cmp);
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
return;
}
//---------------------------------------------------------------------------------------------
template <typename Value>
template <typename Function>
void QUICKSORT<Value>::firstPhase(const Function &Cmp)
{
while (host_1stPhaseTasksAmount > 0)
{
//the 2nd phase task buffer or the 1st phase task buffer is full; as a backup during writing, overflowing tasks were written into the other array
if (host_1stPhaseTasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks)
{
break;
}
//just in case newly created tasks wouldn't fit
if (host_1stPhaseTasksAmount * 2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount))
{
break;
}
int elemPerBlock = getElemPerBlock();
int blocksCnt = initTasks(elemPerBlock, Cmp);
TNL_CHECK_CUDA_DEVICE;
if (blocksCnt >= maxBlocks) //too many blocks needed, switch to 2nd phase
break;
//-----------------------------------------------
//do the partitioning
auto &task = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
int externMemByteSize = elemPerBlock * sizeof(Value);
/**
* check if we can partition using shared memory for coalesced reads and writes
* 1st phase of partitioning
* sets of blocks work on a task
*
* using the atomicAdd intrinsic, each block reserves a chunk of memory into which
* it moves the elements smaller and bigger than the pivot
* */
if (externMemByteSize <= maxSharable)
{
cudaQuickSort1stPhase_1<Value, Function>
<<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
@@ -345,110 +379,124 @@ void QUICKSORT<Value>::sort(const Function &Cmp)
cudaQuickSort1stPhase_2<Value, Function>
<<<blocksCnt, threadsPerBlock>>>(
arr, aux, Cmp, elemPerBlock,
task, cuda_blockToTaskMapping);
}
TNL_CHECK_CUDA_DEVICE;
/**
* fill in the gap between smaller and bigger with elements == pivot
* after writing, also create new tasks; each task generates at most 2 new tasks
*
* tasks smaller than desired_2ndPhasElemPerBlock go into the 2nd phase
* bigger ones need more blocks to partition and are written into newTask
* with iteration % 2, rotate between the 2 task arrays to avoid copying
* */
auto &newTask = iteration % 2 == 0 ? cuda_newTasks : cuda_tasks;
cudaWritePivot<Value>
<<<host_1stPhaseTasksAmount, 1024>>>(
arr, aux, desired_2ndPhasElemPerBlock,
task, newTask, cuda_newTasksAmount.getData(),
cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());
TNL_CHECK_CUDA_DEVICE;
processNewTasks();
iteration++;
}
}
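The partition kernels launched inside firstPhase are elided from this hunk; what follows is only a sketch of the chunk-reservation pattern that the comment above describes. The names (partitionSketch, smallerCnt, biggerCnt) are hypothetical, the comparator is hard-coded to operator< instead of Cmp, and the multi-block staging through shared memory is omitted:

template <typename Value>
__global__ void partitionSketch(const Value *src, Value *dst, int size, Value pivot,
                                int *smallerCnt, int *biggerCnt)
{
    //first pass: count this thread's elements on each side of the pivot
    int mySmaller = 0, myBigger = 0;
    for (int i = threadIdx.x; i < size; i += blockDim.x)
    {
        if (src[i] < pivot)
            mySmaller++;
        else if (pivot < src[i])
            myBigger++;
    }

    //reserve disjoint destination chunks; atomicAdd returns the previous count,
    //i.e. the offset at which this thread's chunk starts
    int myLeft = atomicAdd(smallerCnt, mySmaller);                  //grows from the left end
    int myRight = size - atomicAdd(biggerCnt, myBigger) - myBigger; //grows from the right end

    //second pass: scatter into the reserved chunks
    for (int i = threadIdx.x; i < size; i += blockDim.x)
    {
        if (src[i] < pivot)
            dst[myLeft++] = src[i];
        else if (pivot < src[i])
            dst[myRight++] = src[i];
    }
    //the gap left in the middle is filled with elements == pivot later, by cudaWritePivot
}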
//----------------------------------------------------------------------
template <typename Value>
template <typename Function>
void QUICKSORT<Value>::secondPhase(const Function &Cmp)
{
int total2ndPhase = host_1stPhaseTasksAmount + host_2ndPhaseTasksAmount;
const int stackSize = 32;
auto &leftoverTasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
if (host_1stPhaseTasksAmount > 0 && host_2ndPhaseTasksAmount > 0)
{
auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
cudaQuickSort2ndPhase<Value, Function, stackSize>
<<<total2ndPhase, threadsPerBlock>>>(arr, aux, Cmp, tasks, tasks2);
}
else if (host_1stPhaseTasksAmount > 0)
{
auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
cudaQuickSort2ndPhase<Value, Function, stackSize>
<<<total2ndPhase, threadsPerBlock>>>(arr, aux, Cmp, tasks);
}
else
{
auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
cudaQuickSort2ndPhase<Value, Function, stackSize>
<<<total2ndPhase, threadsPerBlock>>>(arr, aux, Cmp, tasks2);
}
cudaDeviceSynchronize();
TNL_CHECK_CUDA_DEVICE;
return;
}
//----------------------------------------------------------------------
template <typename Value>
int QUICKSORT<Value>::getSetsNeeded(int elemPerBlock) const
{
auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
auto fetch = [=] __cuda_callable__(int i) {
const auto &task = view[i];
int size = task.partitionEnd - task.partitionBegin;
return size / elemPerBlock + (size % elemPerBlock != 0);
};
auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
return Algorithms::Reduction<Devices::Cuda>::reduce(0, host_1stPhaseTasksAmount, fetch, reduction, 0);
}
template <typename Value>
int QUICKSORT<Value>::getElemPerBlock() const
{
int setsNeeded = getSetsNeeded(desiredElemPerBlock);
if (setsNeeded <= maxBlocks)
return desiredElemPerBlock;
//want multiplier * desiredElemPerBlock <= x * threadsPerBlock
//find the smallest x for which this inequality holds
double multiplier = 1. * setsNeeded / maxBlocks;
int elemPerBlock = multiplier * desiredElemPerBlock;
setsNeeded = elemPerBlock / threadsPerBlock + (elemPerBlock % threadsPerBlock != 0);
return setsNeeded * threadsPerBlock;
}
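A worked pass through the rounding above, with assumed numbers:

//assumed: desiredElemPerBlock = 4096, threadsPerBlock = 512, maxBlocks = 1 << 20
//  getSetsNeeded(4096) returns 2.5 * maxBlocks  =>  multiplier = 2.5
//  elemPerBlock = 2.5 * 4096 = 10240
//  setsNeeded   = 10240 / 512 = 20 (no remainder)
//  returned value: 20 * 512 = 10240 elements per block,
//  a multiple of threadsPerBlock that brings the grid back under maxBlocks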
template <typename Value>
template <typename Function>
int QUICKSORT<Value>::initTasks(int elemPerBlock, const Function &Cmp)
{
int threads = min(host_1stPhaseTasksAmount, threadsPerBlock);
int blocks = host_1stPhaseTasksAmount / threads + (host_1stPhaseTasksAmount % threads != 0);
auto src = iteration % 2 == 0 ? arr : aux.getView();
auto &tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;
//[i] == how many blocks task i needs
cudaCalcBlocksNeeded<<<blocks, threads>>>(tasks.getView(0, host_1stPhaseTasksAmount),
elemPerBlock, cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));
thrust::inclusive_scan(thrust::device,
cuda_reductionTaskInitMem.getData(),
cuda_reductionTaskInitMem.getData() + host_1stPhaseTasksAmount,
cuda_reductionTaskInitMem.getData());
int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);
//need too many blocks, give back control
if (blocksNeeded >= cuda_blockToTaskMapping.getSize())
return blocksNeeded;
cudaInitTask<<<host_1stPhaseTasksAmount, 512>>>(
tasks.getView(0, host_1stPhaseTasksAmount),
cuda_blockToTaskMapping.getView(0, blocksNeeded),
cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount),
src, Cmp);
cuda_newTasksAmount.setElement(0, 0);
return blocksNeeded;
@@ -457,7 +505,7 @@ int QUICKSORT<Value>::initTasks(int elemPerBlock, const Function &Cmp)
template <typename Value>
void QUICKSORT<Value>::processNewTasks()
{
host_1stPhaseTasksAmount = cuda_newTasksAmount.getElement(0);
host_2ndPhaseTasksAmount = cuda_2ndPhaseTasksAmount.getElement(0);
}
@@ -468,12 +516,42 @@ void QUICKSORT<Value>::processNewTasks()
template <typename Value, typename Function>
void quicksort(ArrayView<Value, Devices::Cuda> arr, const Function &Cmp)
{
const int maxBlocks = (1 << 20);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, 0);
int sharedReserve = sizeof(Value) + sizeof(int) * 16; //1 pivot + 16 other shared vars reserved
int maxSharable = deviceProp.sharedMemPerBlock - sharedReserve;
//blockDim*multiplier*sizeof(Value) <= maxSharable
int blockDim = 512; //best case
int elemPerBlock = maxSharable / sizeof(Value);
const int maxMultiplier = 8;
int multiplier = min(elemPerBlock / blockDim, maxMultiplier);
if (multiplier <= 0)
{
blockDim = 256;
multiplier = min(elemPerBlock / blockDim, maxMultiplier);
if (multiplier <= 0)
{
//worst case scenario: shared memory can't be utilized at all because of the sheer size of Value
//sort has to be done with the use of global memory alone
QUICKSORT<Value> sorter(arr, maxBlocks, 512, 0, maxSharable);
sorter.sort(Cmp);
return;
}
}
assert(blockDim * multiplier * sizeof(Value) <= maxSharable);
QUICKSORT<Value> sorter(arr, maxBlocks, blockDim, multiplier * blockDim, maxSharable);
sorter.sort(Cmp);
}
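A hedged walk-through of the sizing above, assuming a device with 48 KB of shared memory per block and Value = double:

//assumed: deviceProp.sharedMemPerBlock = 49152, sizeof(Value) = 8, sizeof(int) = 4
//  sharedReserve = 8 + 16 * 4         = 72 bytes
//  maxSharable   = 49152 - 72         = 49080 bytes
//  elemPerBlock  = 49080 / 8          = 6135 elements fit into shared memory
//  multiplier    = min(6135 / 512, 8) = 8 (capped by maxMultiplier)
//  => QUICKSORT runs with blockDim = 512, desiredElemPerBlock = 8 * 512 = 4096,
//     and a staging buffer of 4096 * 8 = 32768 bytes, which satisfies the assert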
template <typename Value>
void quicksort(ArrayView<Value, Devices::Cuda> arr)
{
quicksort(arr, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; });
}
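For orientation, a minimal usage sketch of the two public entry points above (assumed include path and data; the header containing this code must be in scope and the file compiled with nvcc):

#include <TNL/Containers/Array.h> //assumed TNL include path
using namespace TNL;
using namespace TNL::Containers;

int main()
{
    Array<int, Devices::Cuda> data{9, 1, 7, 3, 5};

    //default comparator: ascending
    quicksort(data.getView());

    //custom comparator: descending
    quicksort(data.getView(), [] __cuda_callable__(const int &a, const int &b)
              { return b < a; });
    return 0;
}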