Commit d4e46a7e authored by Xuan Thang Nguyen's avatar Xuan Thang Nguyen
Browse files

bitonic sorter in-place with fetch and swap; tests still need to be implemented

parent a2f401f2
Loading
Loading
Loading
Loading
+64 −1
Original line number Diff line number Diff line
@@ -293,3 +293,66 @@ void bitonicSort(std::vector<Value> & vec)
}

//---------------------------------------------
//---------------------------------------------

/**
 * One global bitonic-merge step over the whole (virtually padded) array.
 *
 * Expected launch: 1D grid, one thread per compare-exchange pair, i.e. at
 * least ceil(size / 2) threads in total; `len` and `monotonicSeqLen` must be
 * powers of two.
 *
 * FIX: the functors were originally taken by const reference. A __global__
 * kernel parameter that is a reference copies only a pointer to the host-side
 * object onto the device, so dereferencing it in device code reads host
 * memory — an illegal access. Closure objects must be passed BY VALUE so the
 * whole captured state is copied into kernel parameter space.
 *
 * @param Fetch  callable int -> value; reads the element at a global index
 * @param Cmp    callable (a, b) -> bool; the "ordered" predicate
 * @param Swap   callable (i, j); exchanges the elements at two global indices
 * @param monotonicSeqLen length of the bitonic sequences built so far
 * @param len    distance*2 between the two compared elements in this step
 * @param partsInSeq number of `len`-sized parts per monotonic sequence
 */
template <typename FETCH, typename CMP,  typename SWAP>
__global__ void bitonicMergeGlobal(int size, FETCH Fetch,
                                 CMP Cmp, SWAP Swap,
                                 int monotonicSeqLen, int len, int partsInSeq)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    int part = i / (len / 2); //computes which sorting block this thread belongs to

    //the index of 2 elements that should be compared and swapped
    int s = part * len + (i & ((len / 2) - 1) );
    int e = s + len / 2;
    if (e >= size) //arr[e] is virtual padding and will not be exchanged with
        return;

    //calculate the direction of swapping
    int monotonicSeqIdx = part / partsInSeq;
    bool ascending = (monotonicSeqIdx & 1) != 0;
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase
        ascending = true;

    //exchange the pair when its order disagrees with the required direction
    if (ascending == Cmp(Fetch(e), Fetch(s)))
        Swap(s, e);
}



/**
 * In-place bitonic sort of the index range [begin, end) expressed through
 * user-supplied accessors, so any indexable structure can be sorted.
 *
 * @param begin  first index of the range to sort (inclusive)
 * @param end    one past the last index to sort (exclusive)
 * @param Fetch  device callable int -> value; reads element at an index
 * @param Cmp    device callable (a, b) -> bool; strict "less" style predicate
 * @param Swap   device callable (i, j); exchanges elements at two indices
 *
 * FIX: swapWithOffset originally forwarded (i+begin, i+begin) — it swapped an
 * element with itself, so the sort never reordered anything. It now forwards
 * both indices, (i+begin, j+begin).
 */
template <typename FETCH, typename CMP,  typename SWAP>
void bitonicSort(int begin, int end, const FETCH & Fetch, const CMP& Cmp, const SWAP & Swap)
{
    int size = end - begin;
    //the algorithm logically pads the range up to the next power of two;
    //padding elements are virtual and never touched (guard in the kernel)
    int paddedSize = closestPow2(size);

    //one thread services one compare-exchange pair -> ceil(size / 2) threads
    int threadsNeeded = size / 2 + (size %2 !=0);

    const int maxThreadsPerBlock = 512;
    int threadPerBlock = maxThreadsPerBlock;
    int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); //ceil-div

    //translate the kernel's 0-based indices into the caller's [begin, end) range
    auto fetchWithOffset =
        [=] __cuda_callable__(int i)
        {
            return Fetch(i + begin);
        };

    auto swapWithOffset =
        [=] __cuda_callable__(int i, int j)
        {
            //forward BOTH indices (the original passed i twice — a no-op swap)
            return Swap(i + begin, j + begin);
        };

    //standard bitonic schedule: grow monotonic sequences, then merge each
    //with strides len, len/2, ..., 2
    for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
    {
        for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
        {
            bitonicMergeGlobal<<<blocks, threadPerBlock>>>(
                size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq);
        }
    }
    //grid-wide ordering between steps is provided by consecutive launches on
    //the same stream; this final sync makes the result visible to the host
    cudaDeviceSynchronize();
}
 No newline at end of file
+32 −0
Original line number Diff line number Diff line
@@ -239,7 +239,39 @@ TEST(sortRange, middleMultiBlock)
    ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1);
    ASSERT_TRUE(arr.back() == -1); 
}
/*
void fetchAndSwapSorter(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view)
{
    
    //auto Fetch = [=]__cuda_callable__(int i){return view[i];};
    //auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;};
    //auto Swap = [=] __device__ (int i, int j){TNL::swap(view[i], view[j]);};
    //bitonicSort(0, view.getSize(), Fetch, Cmp, Swap);
    
}

TEST(fetchAndSwap, oneBlockSort)
{
    int size = 9;
    const int stride = 227;
    int i = 0;

    std::vector<int> orig(size);
    std::iota(orig.begin(), orig.end(), 0);

    do
    {
        if ((i++) % stride != 0)
            continue;

        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
        auto view = cudaArr.getView();
        fetchAndSwapSorter(view);
        ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
    }
    while (std::next_permutation(orig.begin(), orig.end()));
}
*/

//----------------------------------------------------------------------------------