Loading src/quicksort/cudaPartition.cuh +2 −2 Original line number Diff line number Diff line Loading @@ -134,10 +134,10 @@ void copyData(ArrayView<int, Devices::Cuda> src, } else if (data > pivot) { /* if(biggerStart >= dst.getSize() || biggerStart < 0) printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); */ dst[biggerStart++] = data; } } Loading src/quicksort/quicksort.cuh +16 −16 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ using namespace TNL::Containers; //----------------------------------------------------------- __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, int maxElemFor2ndPhase, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt, ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt) { Loading @@ -29,7 +29,7 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, if (size == 0) return; if (size <= blockDim.x * 2) if (size <= maxElemFor2ndPhase) { int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) Loading Loading @@ -98,7 +98,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView<int, Devices::Cuda> arr, ArrayVi template <typename Function> __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int, Devices::Cuda> aux, const Function &Cmp, int elemPerBlock, const Function &Cmp, int maxElemFor2ndPhase, ArrayView<TASK, Devices::Cuda> tasks, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt, ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt) Loading Loading @@ -138,7 +138,8 @@ __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int, arr.getView(leftBegin, leftEnd) , Cmp) + leftBegin; writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, maxElemFor2ndPhase, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } if(rightEnd - rightBegin > 0) Loading @@ -148,7 +149,8 @@ __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int, arr.getView(rightBegin, rightEnd) , Cmp) + rightBegin; writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, maxElemFor2ndPhase, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } } Loading Loading @@ -216,6 +218,8 @@ __global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks, const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k const int g_maxTasks = 1 << 14; const int minElemPerBlock = threadsPerBlock*2; const int maxBitonicSize = threadsPerBlock*2; const int desired_2ndPhasElemPerBlock = maxBitonicSize*8; class QUICKSORT { Loading Loading @@ -283,13 +287,14 @@ void QUICKSORT::sort(const Function &Cmp) //2ndphase task is now full or tasksAmount is full, as backup during writing, overflowing tasks were written into the other array if (tasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks) { //deb("task overflow") break; } //just in case newly created tasks wouldnt fit if(tasksAmount*2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) { break; } int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); Loading @@ -305,7 +310,7 @@ void QUICKSORT::sort(const Function &Cmp) auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; cudaWritePivot<<<tasksAmount, 512>>>( arr, aux, Cmp, elemPerBlock, arr, aux, Cmp, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), Loading @@ -330,14 +335,9 @@ void QUICKSORT::sort(const Function &Cmp) if (host_2ndPhaseTasksAmount > 0) { cudaStream_t s; cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cudaQuickSort2ndPhase<Function, 128> <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock, 0, s>>> <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>> (arr, aux, Cmp, cuda_2ndPhaseTasks); cudaStreamDestroy(s); } Loading Loading
src/quicksort/cudaPartition.cuh +2 −2 Original line number Diff line number Diff line Loading @@ -134,10 +134,10 @@ void copyData(ArrayView<int, Devices::Cuda> src, } else if (data > pivot) { /* if(biggerStart >= dst.getSize() || biggerStart < 0) printf("failed here: b:%d t:%d: tried to write into [%d]/%d\n", blockDim.x, threadIdx.x, biggerStart, dst.getSize()); */ dst[biggerStart++] = data; } } Loading
src/quicksort/quicksort.cuh +16 −16 Original line number Diff line number Diff line Loading @@ -15,7 +15,7 @@ using namespace TNL::Containers; //----------------------------------------------------------- __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, int maxElemFor2ndPhase, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt, ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt) { Loading @@ -29,7 +29,7 @@ __device__ void writeNewTask(int begin, int end, int depth, int pivotIdx, if (size == 0) return; if (size <= blockDim.x * 2) if (size <= maxElemFor2ndPhase) { int idx = atomicAdd(secondPhaseTasksCnt, 1); if (idx < secondPhaseTasks.getSize()) Loading Loading @@ -98,7 +98,7 @@ __global__ void cudaQuickSort1stPhase(ArrayView<int, Devices::Cuda> arr, ArrayVi template <typename Function> __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int, Devices::Cuda> aux, const Function &Cmp, int elemPerBlock, const Function &Cmp, int maxElemFor2ndPhase, ArrayView<TASK, Devices::Cuda> tasks, ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt, ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt) Loading Loading @@ -138,7 +138,8 @@ __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int, arr.getView(leftBegin, leftEnd) , Cmp) + leftBegin; writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); writeNewTask(leftBegin, leftEnd, myTask.depth, leftPivotIdx, maxElemFor2ndPhase, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } if(rightEnd - rightBegin > 0) Loading @@ -148,7 +149,8 @@ __global__ void cudaWritePivot(ArrayView<int, Devices::Cuda> arr, ArrayView<int, arr.getView(rightBegin, rightEnd) , Cmp) + rightBegin; writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); writeNewTask(rightBegin, rightEnd, myTask.depth, rightPivotIdx, maxElemFor2ndPhase, newTasks, newTasksCnt, secondPhaseTasks, secondPhaseTasksCnt); } } Loading Loading @@ -216,6 +218,8 @@ __global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks, const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k const int g_maxTasks = 1 << 14; const int minElemPerBlock = threadsPerBlock*2; const int maxBitonicSize = threadsPerBlock*2; const int desired_2ndPhasElemPerBlock = maxBitonicSize*8; class QUICKSORT { Loading Loading @@ -283,13 +287,14 @@ void QUICKSORT::sort(const Function &Cmp) //2ndphase task is now full or tasksAmount is full, as backup during writing, overflowing tasks were written into the other array if (tasksAmount >= maxTasks || host_2ndPhaseTasksAmount >= maxTasks) { //deb("task overflow") break; } //just in case newly created tasks wouldnt fit if(tasksAmount*2 >= maxTasks + (maxTasks - host_2ndPhaseTasksAmount)) { break; } int elemPerBlock = getElemPerBlock(); int blocksCnt = initTasks(elemPerBlock); Loading @@ -305,7 +310,7 @@ void QUICKSORT::sort(const Function &Cmp) auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; cudaWritePivot<<<tasksAmount, 512>>>( arr, aux, Cmp, elemPerBlock, arr, aux, Cmp, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), Loading @@ -330,14 +335,9 @@ void QUICKSORT::sort(const Function &Cmp) if (host_2ndPhaseTasksAmount > 0) { cudaStream_t s; cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking); cudaQuickSort2ndPhase<Function, 128> <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock, 0, s>>> <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>> (arr, aux, Cmp, cuda_2ndPhaseTasks); cudaStreamDestroy(s); } Loading