proper checking (25177e29) · Commits · TNL / GPUSort

src/quicksort/quicksort.cuh

+14 −15

Original line number	Original line	Diff line number	Diff line
	@@ -251,9 +251,7 @@ public:
	cuda_2ndPhaseTasksAmount = 0;		cuda_2ndPhaseTasksAmount = 0;
	iteration = 0;		iteration = 0;

	auto error = cudaGetLastError();		TNL_CHECK_CUDA_DEVICE;
	if(error != cudaSuccess)
	deb(error);
	}		}

	template <typename Function>		template <typename Function>
	@@ -274,7 +272,6 @@ public:
	template <typename Function>		template <typename Function>
	void QUICKSORT::sort(const Function &Cmp)		void QUICKSORT::sort(const Function &Cmp)
	{		{
	cudaError_t error;

	while (tasksAmount > 0)		while (tasksAmount > 0)
	{		{
	@@ -295,28 +292,32 @@ void QUICKSORT::sort(const Function &Cmp)
	if(blocksCnt > cuda_blockToTaskMapping.getSize())		if(blocksCnt > cuda_blockToTaskMapping.getSize())
	break;		break;

			TNL_CHECK_CUDA_DEVICE;

	int externMemByteSize = elemPerBlock * sizeof(int);		int externMemByteSize = elemPerBlock * sizeof(int);
	auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks;		auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks;

	cudaQuickSort1stPhase<Function>		cudaQuickSort1stPhase<Function>
	<<<blocksCnt, threadsPerBlock, externMemByteSize>>>(		<<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
	arr, aux, Cmp, elemPerBlock,		arr, aux, Cmp, elemPerBlock,
	task, cuda_blockToTaskMapping);		task, cuda_blockToTaskMapping);

			TNL_CHECK_CUDA_DEVICE;

	auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks;		auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks;
	cudaWritePivot<<<tasksAmount, 512>>>(		cudaWritePivot<<<tasksAmount, 512>>>(
	arr, aux, desired_2ndPhasElemPerBlock,		arr, aux, desired_2ndPhasElemPerBlock,
	task, newTask, cuda_newTasksAmount.getData(),		task, newTask, cuda_newTasksAmount.getData(),
	cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());		cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());

			TNL_CHECK_CUDA_DEVICE;

	processNewTasks();		processNewTasks();
	iteration++;		iteration++;
	}		}

	if((error = cudaDeviceSynchronize()) != cudaSuccess)		cudaDeviceSynchronize();
	{		TNL_CHECK_CUDA_DEVICE;
	deb(error);
	return;
	}

	if (tasksAmount > 0)		if (tasksAmount > 0)
	{		{
	@@ -324,6 +325,7 @@ void QUICKSORT::sort(const Function &Cmp)
	cudaQuickSort2ndPhase<Function, 128>		cudaQuickSort2ndPhase<Function, 128>
	<<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks);		<<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks);
	}		}
			TNL_CHECK_CUDA_DEVICE;

	if (host_2ndPhaseTasksAmount > 0)		if (host_2ndPhaseTasksAmount > 0)
	{		{
	@@ -331,13 +333,10 @@ void QUICKSORT::sort(const Function &Cmp)
	<<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>>		<<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>>
	(arr, aux, Cmp, cuda_2ndPhaseTasks);		(arr, aux, Cmp, cuda_2ndPhaseTasks);
	}		}
			TNL_CHECK_CUDA_DEVICE;

			cudaDeviceSynchronize();
	if((error = cudaDeviceSynchronize()) != cudaSuccess)		TNL_CHECK_CUDA_DEVICE;
	{
	deb(error);
	return;
	}
	return;		return;
	}		}