Commit 2f143034 authored by Xuan Thang Nguyen
Browse files

refactor depth -> iteration

parent 4194c809
Loading
Loading
Loading
Loading
+16 −16
Original line number Diff line number Diff line
@@ -31,27 +31,27 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
                          int stackDepth[], int &stackTop,
                          int begin, int pivotBegin,
                          int pivotEnd, int end,
                          int depth);
                          int iteration);

//---------------------------------------------------------------

template <typename Value, typename CMP, int stackSize, bool useShared>
__device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
                                     ArrayView<Value, TNL::Devices::Cuda> aux,
                                     const CMP &Cmp, int _depth,
                                     const CMP &Cmp, int _iteration,
                                     Value *sharedMem, int memSize,
                                     int maxBitonicSize)
{
    if (arr.getSize() <= maxBitonicSize)
    {
        auto &src = (_depth & 1) == 0 ? arr : aux;
        auto &src = (_iteration & 1) == 0 ? arr : aux;
        if (useShared && arr.getSize() <= memSize)
            externSort<Value, CMP>(src, arr, Cmp, sharedMem);
        else
        {
            externSort<Value, CMP>(src, Cmp);
            //extern sort without shared memory only works in-place, need to copy from aux back into arr
            if ((_depth & 1) != 0)
            if ((_iteration & 1) != 0)
                for (int i = threadIdx.x; i < arr.getSize(); i += blockDim.x)
                    arr[i] = src[i];
        }
@@ -61,7 +61,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,

    static __shared__ int stackTop;
    static __shared__ int stackArrBegin[stackSize], stackArrEnd[stackSize], stackDepth[stackSize];
    static __shared__ int begin, end, depth;
    static __shared__ int begin, end, iteration;
    static __shared__ int pivotBegin, pivotEnd;
    Value *piv = sharedMem;
    sharedMem += 1;
@@ -71,7 +71,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
        stackTop = 0;
        stackArrBegin[stackTop] = 0;
        stackArrEnd[stackTop] = arr.getSize();
        stackDepth[stackTop] = _depth;
        stackDepth[stackTop] = _iteration;
        stackTop++;
    }
    __syncthreads();
@@ -83,13 +83,13 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
        {
            begin = stackArrBegin[stackTop - 1];
            end = stackArrEnd[stackTop - 1];
            depth = stackDepth[stackTop - 1];
            iteration = stackDepth[stackTop - 1];
            stackTop--;
        }
        __syncthreads();

        int size = end - begin;
        auto &src = (depth & 1) == 0 ? arr : aux;
        auto &src = (iteration & 1) == 0 ? arr : aux;

        //small enough for bitonic sort
        if (size <= maxBitonicSize)
@@ -100,7 +100,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
            {
                externSort<Value, CMP>(src.getView(begin, end), Cmp);
                //extern sort without shared memory only works in-place, need to copy from aux back into arr
                if ((depth & 1) != 0)
                if ((iteration & 1) != 0)
                    for (int i = threadIdx.x; i < src.getSize(); i += blockDim.x)
                        arr[begin + i] = src[i];
            }
@@ -134,7 +134,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
         * move elements, either use shared mem for coalesced access or without shared mem if data is too big
         * */

        auto &dst = (depth & 1) == 0 ? aux : arr;
        auto &dst = (iteration & 1) == 0 ? aux : arr;

        if (useShared && size <= memSize)
        {
@@ -172,7 +172,7 @@ __device__ void singleBlockQuickSort(ArrayView<Value, TNL::Devices::Cuda> arr,
            stackPush<stackSize>(stackArrBegin, stackArrEnd, stackDepth, stackTop,
                                 begin, begin + pivotBegin,
                                 begin + pivotEnd, end,
                                 depth);
                                 iteration);
        }
        __syncthreads(); //sync to update stackTop
    }                    //ends while loop
@@ -185,7 +185,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
                          int stackDepth[], int &stackTop,
                          int begin, int pivotBegin,
                          int pivotEnd, int end,
                          int depth)
                          int iteration)
{
    int sizeL = pivotBegin - begin, sizeR = end - pivotEnd;

@@ -197,7 +197,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
        {
            stackArrBegin[stackTop] = begin;
            stackArrEnd[stackTop] = pivotBegin;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }

@@ -207,7 +207,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],

            stackArrBegin[stackTop] = pivotEnd;
            stackArrEnd[stackTop] = end;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }
    }
@@ -217,7 +217,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],
        {
            stackArrBegin[stackTop] = pivotEnd;
            stackArrEnd[stackTop] = end;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }

@@ -227,7 +227,7 @@ __device__ void stackPush(int stackArrBegin[], int stackArrEnd[],

            stackArrBegin[stackTop] = begin;
            stackArrEnd[stackTop] = pivotBegin;
            stackDepth[stackTop] = depth + 1;
            stackDepth[stackTop] = iteration + 1;
            stackTop++;
        }
    }
+15 −15
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@ using namespace TNL::Containers;

//-----------------------------------------------------------

__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase,
                             ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt);

@@ -67,8 +67,8 @@ __global__ void cudaQuickSort1stPhase(ArrayView<Value, Devices::Cuda> arr, Array
    Value *sharedMem = piv + 1;

    TASK &myTask = tasks[taskMapping[blockIdx.x]];
    auto &src = (myTask.depth & 1) == 0 ? arr : aux;
    auto &dst = (myTask.depth & 1) == 0 ? aux : arr;
    auto &src = (myTask.iteration & 1) == 0 ? arr : aux;
    auto &dst = (myTask.iteration & 1) == 0 ? aux : arr;

    if (threadIdx.x == 0)
        *piv = src[myTask.pivotIdx];
@@ -95,7 +95,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va
    TASK &myTask = tasks[blockIdx.x];

    if (threadIdx.x == 0)
        *piv = (myTask.depth & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];
        *piv = (myTask.iteration & 1) == 0 ? arr[myTask.pivotIdx] : aux[myTask.pivotIdx];
    __syncthreads();
    Value &pivot = *piv;

@@ -117,7 +117,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va

    if (leftEnd - leftBegin > 0)
    {
        writeNewTask(leftBegin, leftEnd, myTask.depth,
        writeNewTask(leftBegin, leftEnd, myTask.iteration,
                     maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
@@ -126,7 +126,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va
    if (rightEnd - rightBegin > 0)
    {
        writeNewTask(rightBegin, rightEnd,
                     myTask.depth, maxElemFor2ndPhase,
                     myTask.iteration, maxElemFor2ndPhase,
                     newTasks, newTasksCnt,
                     secondPhaseTasks, secondPhaseTasksCnt);
    }
@@ -134,7 +134,7 @@ __global__ void cudaWritePivot(ArrayView<Value, Devices::Cuda> arr, ArrayView<Va

//-----------------------------------------------------------

__device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPhase,
__device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2ndPhase,
                             ArrayView<TASK, Devices::Cuda> newTasks, int *newTasksCnt,
                             ArrayView<TASK, Devices::Cuda> secondPhaseTasks, int *secondPhaseTasksCnt)
{
@@ -152,13 +152,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha
    {
        int idx = atomicAdd(secondPhaseTasksCnt, 1);
        if (idx < secondPhaseTasks.getSize())
            secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
            secondPhaseTasks[idx] = TASK(begin, end, iteration + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(newTasksCnt, 1);
            if (idx < newTasks.getSize())
                newTasks[idx] = TASK(begin, end, depth + 1);
                newTasks[idx] = TASK(begin, end, iteration + 1);
            else
                printf("ran out of memory for second phase task, there isnt even space in newTask list\nPart of array may stay unsorted!!!\n");
        }
@@ -167,13 +167,13 @@ __device__ void writeNewTask(int begin, int end, int depth, int maxElemFor2ndPha
    {
        int idx = atomicAdd(newTasksCnt, 1);
        if (idx < newTasks.getSize())
            newTasks[idx] = TASK(begin, end, depth + 1);
            newTasks[idx] = TASK(begin, end, iteration + 1);
        else
        {
            //printf("ran out of memory, trying backup\n");
            int idx = atomicAdd(secondPhaseTasksCnt, 1);
            if (idx < secondPhaseTasks.getSize())
                secondPhaseTasks[idx] = TASK(begin, end, depth + 1);
                secondPhaseTasks[idx] = TASK(begin, end, iteration + 1);
            else
                printf("ran out of memory for newtask, there isnt even space in second phase task list\nPart of array may stay unsorted!!!\n");
        }
@@ -203,11 +203,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

    if (elemInShared == 0)
    {
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize);
    }
    else
    {
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize);
    }
}

@@ -238,11 +238,11 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array

    if (elemInShared <= 0)
    {
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.depth, sharedMem, 0, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, false>(arrView, auxView, Cmp, myTask.iteration, sharedMem, 0, maxBitonicSize);
    }
    else
    {
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.depth, sharedMem, elemInShared, maxBitonicSize);
        singleBlockQuickSort<Value, CMP, stackSize, true>(arrView, auxView, Cmp, myTask.iteration, sharedMem, elemInShared, maxBitonicSize);
    }
}

+4 −4
Original line number Diff line number Diff line
@@ -7,15 +7,15 @@ struct TASK
    //-----------------------------------------------
    //helper variables for blocks working on this task

    int depth;
    int iteration;
    int pivotIdx;
    int dstBegin, dstEnd;
    int firstBlock, blockCount;//for workers read only values

    __cuda_callable__
    TASK(int begin, int end, int depth)
    TASK(int begin, int end, int iteration)
        : partitionBegin(begin), partitionEnd(end),
        depth(depth), pivotIdx(-1),
        iteration(iteration), pivotIdx(-1),
        dstBegin(-151561), dstEnd(-151561),
        firstBlock(-100), blockCount(-100)
        {}
@@ -42,7 +42,7 @@ std::ostream& operator<<(std::ostream & out, const TASK & task)
{
    out << "[ ";
    out << task.partitionBegin << " - " << task.partitionEnd;
    out << " | " << "depth: " << task.depth;
    out << " | " << "iteration: " << task.iteration;
    out << " | " << "pivotIdx: " << task.pivotIdx;
    return out << " ] ";
}
 No newline at end of file