change bitonic threshold size (bccf9bb2) · Commits · TNL / GPUSort

src/quicksort/quicksort.cuh

+4 −4

Original line number	Diff line number	Diff line
		@@ -30,7 +30,7 @@ class QUICKSORT

		//--------------------------------------

		const int maxBitonicSize = threadsPerBlock * 2;
		const int maxBitonicSize = threadsPerBlock * 8;
		const int desired_2ndPhasElemPerBlock = maxBitonicSize;
		const int g_maxTasks = 1 << 14;
		int maxTasks;
		@@ -284,20 +284,20 @@ void QUICKSORT<Value>::secondPhase(const Function &Cmp)
		auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);

		cudaQuickSort2ndPhase<Value, Function, stackSize>
		<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared);
		<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, tasks2, elemInShared, maxBitonicSize);
		}
		else if (host_1stPhaseTasksAmount > 0)
		{
		auto tasks = leftoverTasks.getView(0, host_1stPhaseTasksAmount);
		cudaQuickSort2ndPhase<Value, Function, stackSize>
		<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared);
		<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks, elemInShared, maxBitonicSize);
		}
		else
		{
		auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);

		cudaQuickSort2ndPhase<Value, Function, stackSize>
		<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared);
		<<<total2ndPhase, threadsPerBlock, externSharedByteSize>>>(arr, aux, Cmp, tasks2, elemInShared, maxBitonicSize);
		}
		}

src/quicksort/quicksort_1Block.cuh

+4 −3

Original line number	Diff line number	Diff line
		@@ -39,9 +39,10 @@ template <typename Value, typename Function, int stackSize, bool useShared>
		__device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
		ArrayView<Value, TNL::Devices::Cuda> aux,
		const Function &Cmp, int _depth,
		Value *sharedMem, int memSize)
		Value *sharedMem, int memSize,
		int maxBitonicSize)
		{
		if (arr.getSize() <= blockDim.x * 2)
		if (arr.getSize() <= maxBitonicSize)
		{
		auto &src = (_depth & 1) == 0 ? arr : aux;
		if (useShared && arr.getSize() <= memSize)
		@@ -91,7 +92,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
		auto &src = (depth & 1) == 0 ? arr : aux;

		//small enough for bitonic
		if (size <= blockDim.x * 2)
		if (size <= maxBitonicSize)
		{
		if (useShared && size <= memSize)
		externSort<Value, Function>(src.getView(begin, end), arr.getView(begin, end), Cmp, sharedMem);

src/quicksort/quicksort_kernel.cuh

+10 −6

Original line number	Diff line number	Diff line
		@@ -185,7 +185,7 @@ template <typename Value, typename Function, int stackSize>
		__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
		const Function &Cmp,
		ArrayView<TASK, Devices::Cuda> secondPhaseTasks,
		int elemInShared)
		int elemInShared, int maxBitonicSize)
		{
		extern __shared__ int externMem[];
		Value *sharedMem = (Value *)externMem;
		@@ -202,11 +202,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

		if (elemInShared == 0)
		{
		singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0);
		singleBlockQuickSort<Value, Function, stackSize, false>
		(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
		}
		else
		{
		singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
		singleBlockQuickSort<Value, Function, stackSize, true>
		(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
		}
		}

		@@ -215,7 +217,7 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
		const Function &Cmp,
		ArrayView<TASK, Devices::Cuda> secondPhaseTasks1,
		ArrayView<TASK, Devices::Cuda> secondPhaseTasks2,
		int elemInShared)
		int elemInShared, int maxBitonicSize)
		{
		extern __shared__ int externMem[];
		Value *sharedMem = (Value *)externMem;
		@@ -237,11 +239,13 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

		if (elemInShared == 0)
		{
		singleBlockQuickSort<Value, Function, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0);
		singleBlockQuickSort<Value, Function, stackSize, false>
		(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
		}
		else
		{
		singleBlockQuickSort<Value, Function, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared);
		singleBlockQuickSort<Value, Function, stackSize, true>
		(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
		}
		}