merge the 2 2nd phase launches and change elemPerBlock (7e72c87e) · Commits · TNL / GPUSort

src/quicksort/quicksort.cuh

+46 −17

Original line number	Original line	Diff line number	Diff line
	@@ -183,6 +183,28 @@ __global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, Array
	singleBlockQuickSort<Value, Function, stackSize>(arrView, auxView, Cmp, myTask.depth);		singleBlockQuickSort<Value, Function, stackSize>(arrView, auxView, Cmp, myTask.depth);
	}		}


			template <typename Value, typename Function, int stackSize>
			__global__ void cudaQuickSort2ndPhase(ArrayView<Value, Devices::Cuda> arr, ArrayView<Value, Devices::Cuda> aux,
			const Function &Cmp,
			ArrayView<TASK, Devices::Cuda> secondPhaseTasks1,
			ArrayView<TASK, Devices::Cuda> secondPhaseTasks2)
			{
			TASK myTask;
			if(blockIdx.x < secondPhaseTasks1.getSize())
			myTask = secondPhaseTasks1[blockIdx.x];
			else
			myTask = secondPhaseTasks2[blockIdx.x - secondPhaseTasks1.getSize()];

			if(myTask.partitionEnd - myTask.partitionBegin <= 0 )
			return;

			auto arrView = arr.getView(myTask.partitionBegin, myTask.partitionEnd);
			auto auxView = aux.getView(myTask.partitionBegin, myTask.partitionEnd);

			singleBlockQuickSort<Value, Function, stackSize>(arrView, auxView, Cmp, myTask.depth);
			}

	//-----------------------------------------------------------		//-----------------------------------------------------------

	__global__ void cudaCalcBlocksNeeded(ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,		__global__ void cudaCalcBlocksNeeded(ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
	@@ -222,9 +244,9 @@ __global__ void cudaInitTask(ArrayView<TASK, Devices::Cuda> cuda_tasks,
	//-----------------------------------------------------------		//-----------------------------------------------------------
	const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k		const int threadsPerBlock = 512, g_maxBlocks = 1 << 15; //32k
	const int g_maxTasks = 1 << 14;		const int g_maxTasks = 1 << 14;
	const int minElemPerBlock = threadsPerBlock*2;		const int minElemPerBlock = threadsPerBlock*10;
	const int maxBitonicSize = threadsPerBlock*2;		const int maxBitonicSize = threadsPerBlock*2;
	const int desired_2ndPhasElemPerBlock = maxBitonicSize*8;		const int desired_2ndPhasElemPerBlock = maxBitonicSize;

	template<typename Value>		template<typename Value>
	class QUICKSORT		class QUICKSORT
	@@ -341,24 +363,31 @@ void QUICKSORT<Value>::sort(const Function &Cmp)
	iteration++;		iteration++;
	}		}

	if (tasksAmount > 0)		int total2ndPhase = tasksAmount + host_2ndPhaseTasksAmount;
			if (total2ndPhase > 0)
	{		{
	auto & tasks = iteration % 2 == 0 ? cuda_tasks : cuda_newTasks;		const int stackSize = 32;
	cudaQuickSort2ndPhase<Value, Function, 128>		if(tasksAmount >0 && host_2ndPhaseTasksAmount > 0)
	<<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks);		{
			auto tasks = iteration % 2 == 0 ? cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount);
			auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);

	TNL_CHECK_CUDA_DEVICE;		cudaQuickSort2ndPhase<Value, Function, stackSize>
	cudaDeviceSynchronize();		<<<total2ndPhase , threadsPerBlock>>>(arr, aux, Cmp, tasks, tasks2);
	TNL_CHECK_CUDA_DEVICE;
	}		}
			else if(tasksAmount >0)
	if (host_2ndPhaseTasksAmount > 0)		{
			auto tasks = iteration % 2 == 0 ? cuda_tasks.getView(0, tasksAmount) : cuda_newTasks.getView(0, tasksAmount);
			cudaQuickSort2ndPhase<Value, Function, stackSize>
			<<<total2ndPhase , threadsPerBlock>>>(arr, aux, Cmp, tasks);
			}
			else
	{		{
	cudaQuickSort2ndPhase<Value, Function, 128>		auto tasks2 = cuda_2ndPhaseTasks.getView(0, host_2ndPhaseTasksAmount);
	<<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>>
	(arr, aux, Cmp, cuda_2ndPhaseTasks);

	TNL_CHECK_CUDA_DEVICE;		cudaQuickSort2ndPhase<Value, Function, stackSize>
			<<<total2ndPhase , threadsPerBlock>>>(arr, aux, Cmp, tasks2);
			}
	}		}
	cudaDeviceSynchronize();		cudaDeviceSynchronize();
	TNL_CHECK_CUDA_DEVICE;		TNL_CHECK_CUDA_DEVICE;