Loading src/quicksort/quicksort.cuh +14 −15 Original line number Original line Diff line number Diff line Loading @@ -251,9 +251,7 @@ public: cuda_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; iteration = 0; iteration = 0; auto error = cudaGetLastError(); TNL_CHECK_CUDA_DEVICE; if(error != cudaSuccess) deb(error); } } template <typename Function> template <typename Function> Loading @@ -274,7 +272,6 @@ public: template <typename Function> template <typename Function> void QUICKSORT::sort(const Function &Cmp) void QUICKSORT::sort(const Function &Cmp) { { cudaError_t error; while (tasksAmount > 0) while (tasksAmount > 0) { { Loading @@ -295,28 +292,32 @@ void QUICKSORT::sort(const Function &Cmp) if(blocksCnt > cuda_blockToTaskMapping.getSize()) if(blocksCnt > cuda_blockToTaskMapping.getSize()) break; break; TNL_CHECK_CUDA_DEVICE; int externMemByteSize = elemPerBlock * sizeof(int); int externMemByteSize = elemPerBlock * sizeof(int); auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks; auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks; cudaQuickSort1stPhase<Function> cudaQuickSort1stPhase<Function> <<<blocksCnt, threadsPerBlock, externMemByteSize>>>( <<<blocksCnt, threadsPerBlock, externMemByteSize>>>( arr, aux, Cmp, elemPerBlock, arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); task, cuda_blockToTaskMapping); TNL_CHECK_CUDA_DEVICE; auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; cudaWritePivot<<<tasksAmount, 512>>>( cudaWritePivot<<<tasksAmount, 512>>>( arr, aux, desired_2ndPhasElemPerBlock, arr, aux, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), task, newTask, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); TNL_CHECK_CUDA_DEVICE; processNewTasks(); processNewTasks(); iteration++; iteration++; } } if((error = cudaDeviceSynchronize()) != cudaSuccess) cudaDeviceSynchronize(); { TNL_CHECK_CUDA_DEVICE; deb(error); return; } if (tasksAmount > 0) if (tasksAmount > 0) { { Loading @@ -324,6 +325,7 @@ void QUICKSORT::sort(const Function &Cmp) cudaQuickSort2ndPhase<Function, 128> cudaQuickSort2ndPhase<Function, 128> <<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks); <<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks); } } TNL_CHECK_CUDA_DEVICE; if (host_2ndPhaseTasksAmount > 0) if (host_2ndPhaseTasksAmount > 0) { { Loading @@ -331,13 +333,10 @@ void QUICKSORT::sort(const Function &Cmp) <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>> <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>> (arr, aux, Cmp, cuda_2ndPhaseTasks); (arr, aux, Cmp, cuda_2ndPhaseTasks); } } TNL_CHECK_CUDA_DEVICE; cudaDeviceSynchronize(); if((error = cudaDeviceSynchronize()) != cudaSuccess) TNL_CHECK_CUDA_DEVICE; { deb(error); return; } return; return; } } Loading Loading
src/quicksort/quicksort.cuh +14 −15 Original line number Original line Diff line number Diff line Loading @@ -251,9 +251,7 @@ public: cuda_2ndPhaseTasksAmount = 0; cuda_2ndPhaseTasksAmount = 0; iteration = 0; iteration = 0; auto error = cudaGetLastError(); TNL_CHECK_CUDA_DEVICE; if(error != cudaSuccess) deb(error); } } template <typename Function> template <typename Function> Loading @@ -274,7 +272,6 @@ public: template <typename Function> template <typename Function> void QUICKSORT::sort(const Function &Cmp) void QUICKSORT::sort(const Function &Cmp) { { cudaError_t error; while (tasksAmount > 0) while (tasksAmount > 0) { { Loading @@ -295,28 +292,32 @@ void QUICKSORT::sort(const Function &Cmp) if(blocksCnt > cuda_blockToTaskMapping.getSize()) if(blocksCnt > cuda_blockToTaskMapping.getSize()) break; break; TNL_CHECK_CUDA_DEVICE; int externMemByteSize = elemPerBlock * sizeof(int); int externMemByteSize = elemPerBlock * sizeof(int); auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks; auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks; cudaQuickSort1stPhase<Function> cudaQuickSort1stPhase<Function> <<<blocksCnt, threadsPerBlock, externMemByteSize>>>( <<<blocksCnt, threadsPerBlock, externMemByteSize>>>( arr, aux, Cmp, elemPerBlock, arr, aux, Cmp, elemPerBlock, task, cuda_blockToTaskMapping); task, cuda_blockToTaskMapping); TNL_CHECK_CUDA_DEVICE; auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks; cudaWritePivot<<<tasksAmount, 512>>>( cudaWritePivot<<<tasksAmount, 512>>>( arr, aux, desired_2ndPhasElemPerBlock, arr, aux, desired_2ndPhasElemPerBlock, task, newTask, cuda_newTasksAmount.getData(), task, newTask, cuda_newTasksAmount.getData(), cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData()); TNL_CHECK_CUDA_DEVICE; processNewTasks(); processNewTasks(); iteration++; iteration++; } } if((error = cudaDeviceSynchronize()) != cudaSuccess) cudaDeviceSynchronize(); { TNL_CHECK_CUDA_DEVICE; deb(error); return; } if (tasksAmount > 0) if (tasksAmount > 0) { { Loading @@ -324,6 +325,7 @@ void QUICKSORT::sort(const Function &Cmp) cudaQuickSort2ndPhase<Function, 128> cudaQuickSort2ndPhase<Function, 128> <<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks); <<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks); } } TNL_CHECK_CUDA_DEVICE; if (host_2ndPhaseTasksAmount > 0) if (host_2ndPhaseTasksAmount > 0) { { Loading @@ -331,13 +333,10 @@ void QUICKSORT::sort(const Function &Cmp) <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>> <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>> (arr, aux, Cmp, cuda_2ndPhaseTasks); (arr, aux, Cmp, cuda_2ndPhaseTasks); } } TNL_CHECK_CUDA_DEVICE; cudaDeviceSynchronize(); if((error = cudaDeviceSynchronize()) != cudaSuccess) TNL_CHECK_CUDA_DEVICE; { deb(error); return; } return; return; } } Loading