Commit 08769c95 authored by Xuan Thang Nguyen's avatar Xuan Thang Nguyen
Browse files

reformatting and adding function descriptions

parent 3717d64a
Loading
Loading
Loading
Loading
+124 −104
Original line number Diff line number Diff line
@@ -18,12 +18,15 @@ __host__ __device__ int closestPow2(int x)
/**
 * Compare-and-swap primitive of the bitonic network.
 * Swaps a and b exactly when the pair violates the requested direction:
 * with ascending == true the pair ends up in Cmp-order, with
 * ascending == false in the reversed order.
 * (Fix: the diff artifact had left both the old and the reformatted
 * version of the condition line stacked, yielding a redundant nested if.)
 */
template <typename Value, typename Function>
__host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp)
{
    // Cmp(b, a) is true when the pair is out of Cmp-order; swap only when
    // that disagreement matches the requested direction.
    if (ascending == Cmp(b, a))
        TNL::swap(a, b);
}

//---------------------------------------------

/**
 * this kernel simulates 1 exchange 
 * splits input arr that is bitonic into 2 bitonic sequences
 */
template <typename Value, typename Function>
__global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
@@ -48,15 +51,17 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Device

    cmpSwap(arr[s], arr[e], ascending, Cmp);
}

//---------------------------------------------
//---------------------------------------------

/**
 * kernel for merging if whole block fits into shared memory
 * will merge all the way down until stride == 2
 * simulates many layers of merge
 * turns input that is a bitonic sequence into 1 monotonic sequence
 * 
 * this version uses shared memory to do the operations
 * */
template <typename Value, typename Function>
__global__
void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
                                         const Function &Cmp,
                                         int monotonicSeqLen, int len, int partsInSeq)
{
@@ -120,10 +125,14 @@ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cu
    }
}


/**
 * simulates many layers of merge
 * turns input that is a bitonic sequence into 1 monotonic sequence
 * 
 * this version only operates on global memory; no shared memory is used
 * */
template <typename Value, typename Function>
__global__
void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
__global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
                             const Function &Cmp,
                             int monotonicSeqLen, int len, int partsInSeq)
{
@@ -160,9 +169,15 @@ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,

//---------------------------------------------

/**
 * IMPORTANT: all threads in block have to call this function to work properly
 * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted
 * Description: sorts src and writes into dst within a block
 * works independently from other concurrent blocks
 * @param sharedMem sharedMem pointer has to be able to store blockDim.x*2 elements
 * */
template <typename Value, typename Function>
__device__
void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, 
__device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
                                  TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
                                  Value *sharedMem, const Function &Cmp)
{
@@ -217,9 +232,16 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src
    }
}


/**
 * IMPORTANT: all threads in block have to call this function to work properly
 * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted
 * Description: sorts src and writes into dst within a block
 * works independently from other concurrent blocks
 * this version doesn't use shared memory and is preferred for Values with a big size
 * */
template <typename Value, typename Function>
__device__
void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, 
__device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
                                  TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
                                  const Function &Cmp)
{
@@ -249,10 +271,9 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src
}

/**
 * very similar to bitonicMergeSharedMemory
 * does bitonicMergeSharedMemory but afterwards increases monotonicSeqLen
 *  then trickles down again
 * this continues until whole sharedMem is sorted
 * entrypoint for bitonicSort_Block
 * sorts @param arr in alternating order to create bitonic sequences
 * sharedMem has to be able to store at least blockDim.x*2 elements
 * */
template <typename Value, typename Function>
__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp)
@@ -266,10 +287,14 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value,
        bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, Cmp);
    else
        bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem,
            [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);}
        );
                          [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); });
}

/**
 * entrypoint for bitonicSort_Block
 * sorts @param arr in alternating order to create bitonic sequences
 * doesn't use shared memory
 * */
template <typename Value, typename Function>
__global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp)
{
@@ -280,8 +305,7 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices
        bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), Cmp);
    else
        bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd),
            [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);}
        );
                          [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); });
}

//---------------------------------------------
@@ -418,8 +442,6 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch,
        Swap(s, e);
}



template <typename FETCH, typename CMP, typename SWAP>
void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
{
@@ -433,14 +455,12 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP& Cmp, SWAP Swap)
    int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0);

    auto fetchWithOffset =
        [=] __cuda_callable__(int i)
        {
        [=] __cuda_callable__(int i) {
            return Fetch(i + begin);
        };

    auto swapWithOffset =
        [=] __cuda_callable__(int i, int j) mutable
        {
        [=] __cuda_callable__(int i, int j) mutable {
            Swap(i + begin, j + begin);
        };