Loading src/quicksort/quicksort.cuh +4 −4 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ class QUICKSORT //-------------------------------------- const int maxBitonicSize = threadsPerBlock * 2; const int maxBitonicSize = threadsPerBlock * 8; const int desired_2ndPhasElemPerBlock = maxBitonicSize; const int g_maxTasks = 1 << 14; int maxTasks; Loading Loading @@ -284,20 +284,20 @@ void QUICKSORT<Value>::secondPhase(const Function &Cmp) auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase<Value, Function, stackSize> <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared); <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared, maxBitonicSize); } else if (host_1stPhaseTasksAmount > 0) { auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); cudaQuickSort2ndPhase<Value, Function, stackSize> <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared); <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared, maxBitonicSize); } else { auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase<Value, Function, stackSize> <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared); <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared, maxBitonicSize); } } Loading src/quicksort/quicksort_1Block.cuh +4 −3 Original line number Diff line number Diff line Loading @@ -39,9 +39,10 @@ template <typename Value, typename Function, int stackSize, bool useShared> __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr, ArrayView<Value, TNL::Devices::Cuda> aux, const Function &Cmp, int _depth, Value *sharedMem, int memSize) Value *sharedMem, int memSize, int maxBitonicSize) { if (arr.getSize() <= blockDim.x * 2) if (arr.getSize() <= maxBitonicSize) { auto &src = (_depth & 1) == 0 ? arr : aux; if (useShared && arr.getSize() <= memSize) Loading Loading @@ -91,7 +92,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr, auto &src = (depth & 1) == 0 ? arr : aux; //small enough for for bitonic if (size <= blockDim.x * 2) if (size <= maxBitonicSize) { if (useShared && size <= memSize) externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); Loading src/quicksort/quicksort_kernel.cuh +10 −6 Original line number Diff line number Diff line Loading @@ -185,7 +185,7 @@ template <typename Value, typename Function, int stackSize> __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, const Function &Cmp, ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int elemInShared) int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; Loading @@ -202,11 +202,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array if (elemInShared == 0) { singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); singleBlockQuickSort<Value, Function, stackSize, false> (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); singleBlockQuickSort<Value, Function, stackSize, true> (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } Loading @@ -215,7 +217,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array const Function &Cmp, ArrayView<TASK, Devices::Cuda> secondPhaseTasks1, ArrayView<TASK, Devices::Cuda> secondPhaseTasks2, int elemInShared) int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; Loading @@ -237,11 +239,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array if (elemInShared == 0) { singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); singleBlockQuickSort<Value, Function, stackSize, false> (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); singleBlockQuickSort<Value, Function, stackSize, true> (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } Loading Loading
src/quicksort/quicksort.cuh +4 −4 Original line number Diff line number Diff line Loading @@ -30,7 +30,7 @@ class QUICKSORT //-------------------------------------- const int maxBitonicSize = threadsPerBlock * 2; const int maxBitonicSize = threadsPerBlock * 8; const int desired_2ndPhasElemPerBlock = maxBitonicSize; const int g_maxTasks = 1 << 14; int maxTasks; Loading Loading @@ -284,20 +284,20 @@ void QUICKSORT<Value>::secondPhase(const Function &Cmp) auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase<Value, Function, stackSize> <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared); <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared, maxBitonicSize); } else if (host_1stPhaseTasksAmount > 0) { auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount); cudaQuickSort2ndPhase<Value, Function, stackSize> <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared); <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared, maxBitonicSize); } else { auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount); cudaQuickSort2ndPhase<Value, Function, stackSize> <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared); <<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared, maxBitonicSize); } } Loading
src/quicksort/quicksort_1Block.cuh +4 −3 Original line number Diff line number Diff line Loading @@ -39,9 +39,10 @@ template <typename Value, typename Function, int stackSize, bool useShared> __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr, ArrayView<Value, TNL::Devices::Cuda> aux, const Function &Cmp, int _depth, Value *sharedMem, int memSize) Value *sharedMem, int memSize, int maxBitonicSize) { if (arr.getSize() <= blockDim.x * 2) if (arr.getSize() <= maxBitonicSize) { auto &src = (_depth & 1) == 0 ? arr : aux; if (useShared && arr.getSize() <= memSize) Loading Loading @@ -91,7 +92,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr, auto &src = (depth & 1) == 0 ? arr : aux; //small enough for for bitonic if (size <= blockDim.x * 2) if (size <= maxBitonicSize) { if (useShared && size <= memSize) externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem); Loading
src/quicksort/quicksort_kernel.cuh +10 −6 Original line number Diff line number Diff line Loading @@ -185,7 +185,7 @@ template <typename Value, typename Function, int stackSize> __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux, const Function &Cmp, ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int elemInShared) int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; Loading @@ -202,11 +202,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array if (elemInShared == 0) { singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); singleBlockQuickSort<Value, Function, stackSize, false> (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); singleBlockQuickSort<Value, Function, stackSize, true> (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } Loading @@ -215,7 +217,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array const Function &Cmp, ArrayView<TASK, Devices::Cuda> secondPhaseTasks1, ArrayView<TASK, Devices::Cuda> secondPhaseTasks2, int elemInShared) int elemInShared, int maxBitonicSize) { extern __shared__ int externMem[]; Value *sharedMem = (Value *)externMem; Loading @@ -237,11 +239,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array if (elemInShared == 0) { singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0); singleBlockQuickSort<Value, Function, stackSize, false> (arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize); } else { singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared); singleBlockQuickSort<Value, Function, stackSize, true> (arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize); } } Loading