Commit 25177e29 authored by Xuan Thang Nguyen
Browse files

proper checking (replace the ad-hoc cudaGetLastError()/deb(error) handling with TNL_CHECK_CUDA_DEVICE)

parent 4227d319
Loading
Loading
Loading
Loading
+14 −15
Original line number Original line Diff line number Diff line
@@ -251,9 +251,7 @@ public:
        cuda_2ndPhaseTasksAmount = 0;
        cuda_2ndPhaseTasksAmount = 0;
        iteration = 0;
        iteration = 0;


        auto error = cudaGetLastError();
        TNL_CHECK_CUDA_DEVICE;
        if(error != cudaSuccess)
            deb(error);
    }
    }


    template <typename Function>
    template <typename Function>
@@ -274,7 +272,6 @@ public:
template <typename Function>
template <typename Function>
void QUICKSORT::sort(const Function &Cmp)
void QUICKSORT::sort(const Function &Cmp)
{
{
    cudaError_t error;
    
    
    while (tasksAmount > 0)
    while (tasksAmount > 0)
    {
    {
@@ -295,28 +292,32 @@ void QUICKSORT::sort(const Function &Cmp)
        if(blocksCnt > cuda_blockToTaskMapping.getSize())
        if(blocksCnt > cuda_blockToTaskMapping.getSize())
            break;
            break;


        TNL_CHECK_CUDA_DEVICE;

        int externMemByteSize = elemPerBlock * sizeof(int);
        int externMemByteSize = elemPerBlock * sizeof(int);
        auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks;
        auto & task = iteration % 2 == 0? cuda_tasks : cuda_newTasks;

        cudaQuickSort1stPhase<Function>
        cudaQuickSort1stPhase<Function>
            <<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
            <<<blocksCnt, threadsPerBlock, externMemByteSize>>>(
                arr, aux, Cmp, elemPerBlock,
                arr, aux, Cmp, elemPerBlock,
                task, cuda_blockToTaskMapping);
                task, cuda_blockToTaskMapping);
                
                
        TNL_CHECK_CUDA_DEVICE;

        auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks;
        auto & newTask = iteration % 2 == 0? cuda_newTasks : cuda_tasks;
        cudaWritePivot<<<tasksAmount, 512>>>(
        cudaWritePivot<<<tasksAmount, 512>>>(
            arr, aux, desired_2ndPhasElemPerBlock,
            arr, aux, desired_2ndPhasElemPerBlock,
            task, newTask, cuda_newTasksAmount.getData(),
            task, newTask, cuda_newTasksAmount.getData(),
            cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());
            cuda_2ndPhaseTasks, cuda_2ndPhaseTasksAmount.getData());


        TNL_CHECK_CUDA_DEVICE;

        processNewTasks();
        processNewTasks();
        iteration++;
        iteration++;
    }
    }


    if((error = cudaDeviceSynchronize()) != cudaSuccess)
    cudaDeviceSynchronize();
    {
    TNL_CHECK_CUDA_DEVICE;
        deb(error);
        return;
    }
    
    
    if (tasksAmount > 0)
    if (tasksAmount > 0)
    {
    {
@@ -324,6 +325,7 @@ void QUICKSORT::sort(const Function &Cmp)
        cudaQuickSort2ndPhase<Function, 128>
        cudaQuickSort2ndPhase<Function, 128>
            <<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks);
            <<<min(tasksAmount,tasks.getSize()) , threadsPerBlock>>>(arr, aux, Cmp, tasks);
    }
    }
    TNL_CHECK_CUDA_DEVICE;
    
    
    if (host_2ndPhaseTasksAmount > 0)
    if (host_2ndPhaseTasksAmount > 0)
    {
    {
@@ -331,13 +333,10 @@ void QUICKSORT::sort(const Function &Cmp)
            <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>>
            <<<min(host_2ndPhaseTasksAmount,cuda_2ndPhaseTasks.getSize()) , threadsPerBlock>>>
            (arr, aux, Cmp, cuda_2ndPhaseTasks);
            (arr, aux, Cmp, cuda_2ndPhaseTasks);
    }
    }
    TNL_CHECK_CUDA_DEVICE;



    cudaDeviceSynchronize();
    if((error = cudaDeviceSynchronize()) != cudaSuccess)
    TNL_CHECK_CUDA_DEVICE;
    {
        deb(error);
        return;
    }
    return;
    return;
}
}