Commit f0926de3 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Refactored reduction and scan in quicksort implementation

parent 1a64a618
Loading
Loading
Loading
Loading
+1 −2
Original line number Diff line number Diff line
@@ -13,7 +13,6 @@
#pragma once

#include <TNL/Containers/Array.h>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Sorting/detail/task.h>

namespace TNL {
@@ -94,7 +93,7 @@ class Quicksorter< Value, Devices::Cuda >
      Containers::Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount;  //is in reality 1 integer each

      Containers::Array<int, Devices::Cuda> cuda_blockToTaskMapping;
      Containers::Vector<int, Devices::Cuda> cuda_reductionTaskInitMem;
      Containers::Array<int, Devices::Cuda> cuda_reductionTaskInitMem;

      int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0;
      int iteration = 0;
+4 −15
Original line number Diff line number Diff line
@@ -17,7 +17,8 @@
#include <TNL/Algorithms/Sorting/detail/quicksort_kernel.h>
#include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
#include <TNL/Algorithms/Sorting/detail/Quicksorter.h>
#include <TNL/Algorithms/Scan.h>
#include <TNL/Algorithms/reduce.h>
#include <TNL/Algorithms/scan.h>

namespace TNL {
    namespace Algorithms {
@@ -314,8 +315,7 @@ int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::C
        int size = task.partitionEnd - task.partitionBegin;
        return size / elemPerBlock + (size % elemPerBlock != 0);
    };
    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
    return Algorithms::reduce<Devices::Cuda>( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 );
    return reduce< Devices::Cuda >( 0, quicksort.host_1stPhaseTasksAmount, fetch, TNL::Plus{} );
}

template< typename Value >
@@ -323,14 +323,6 @@ int
Quicksorter< Value, Devices::Cuda >::
getSetsNeeded(int elemPerBlock) const
{
    /*auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView();
    auto fetch = [=] __cuda_callable__(int i) {
        const auto &task = view[i];
        int size = task.partitionEnd - task.partitionBegin;
        return size / elemPerBlock + (size % elemPerBlock != 0);
    };
    auto reduction = [] __cuda_callable__(int a, int b) { return a + b; };
    return Algorithms::reduce<Devices::Cuda>(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/
    return getSetsNeededFunction< Value >( elemPerBlock, *this );
}

@@ -372,10 +364,7 @@ initTasks(int elemPerBlock, const CMP &Cmp)
                                                      cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount));
    //cuda_reductionTaskInitMem[i] == how many blocks task i needs

    //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; };

    Algorithms::Scan<Devices::Cuda, Algorithms::ScanType::Inclusive >::
        perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0);
    inplaceInclusiveScan(cuda_reductionTaskInitMem);
    //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need

    int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1);
+6 −3
Original line number Diff line number Diff line
@@ -13,8 +13,8 @@
#pragma once

#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/Sorting/detail/reduction.h>
#include <TNL/Algorithms/Sorting/detail/task.h>
#include <TNL/Algorithms/detail/CudaScanKernel.h>

namespace TNL {
   namespace Algorithms {
@@ -185,8 +185,11 @@ __device__ void cudaPartition( Containers::ArrayView<Value, Devices::Cuda> src,
    int smaller = 0, bigger = 0;
    countElem(srcView, Cmp, smaller, bigger, pivot);

    int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
    int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
    //synchronization is in this function already
    using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >;
    __shared__ typename BlockScan::Storage storage;
    int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage );
    int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage );

    if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values
    {
+5 −4
Original line number Diff line number Diff line
@@ -15,8 +15,7 @@
#include <TNL/Containers/Array.h>
#include "cassert"
#include <TNL/Algorithms/Sorting/detail/bitonicSort.h>
#include <TNL/Algorithms/Sorting/detail/reduction.h>
#include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
#include <TNL/Algorithms/detail/CudaScanKernel.h>

namespace TNL {
    namespace Algorithms {
@@ -134,8 +133,10 @@ __device__ void singleBlockQuickSort( Containers::ArrayView<Value, TNL::Devices:
        countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot);

        //synchronization is in this function already
        int smallerPrefSumInc = blockInclusivePrefixSum(smaller);
        int biggerPrefSumInc = blockInclusivePrefixSum(bigger);
        using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >;
        __shared__ typename BlockScan::Storage storage;
        int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage );
        int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage );

        if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src
        {
+2 −4
Original line number Diff line number Diff line
@@ -13,8 +13,6 @@
#pragma once

#include <TNL/Containers/Array.h>
#include <TNL/Containers/Vector.h>
#include <TNL/Algorithms/Sorting/detail/reduction.h>
#include <TNL/Algorithms/Sorting/detail/task.h>
#include <TNL/Algorithms/Sorting/detail/cudaPartition.h>
#include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h>
@@ -33,7 +31,7 @@ __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2n
//-----------------------------------------------------------

__global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock,
                                     Containers::VectorView<int, Devices::Cuda> blocksNeeded)
                                     Containers::ArrayView<int, Devices::Cuda> blocksNeeded)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= cuda_tasks.getSize())
@@ -49,7 +47,7 @@ __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda>
template <typename Value, typename CMP>
__global__ void cudaInitTask(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks,
                             Containers::ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping,
                             Containers::VectorView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                             Containers::ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem,
                             Containers::ArrayView<Value, Devices::Cuda> src, CMP Cmp)
{
    if (blockIdx.x >= cuda_tasks.getSize())