Skip to content
Snippets Groups Projects
bitonicSort.h 8.45 KiB
Newer Older
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
#include <TNL/Containers/Array.h>
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
#include <iostream>
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed

using namespace TNL;
using namespace TNL::Containers;

typedef Devices::Cuda Device;

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
#define deb(x) std::cout << #x << " = " << x << std::endl;

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
//---------------------------------------------

__host__ __device__ int closestPow2(int x)
{
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    if (x == 0)
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
        return 0;

    int ret = 1;
    while (ret < x)
        ret <<= 1;

    return ret;
}

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
template <typename Value, typename Function>
__host__ __device__ void cmpSwap(Value & a, Value &b, bool ascending, const Function & Cmp)
    if( (ascending == Cmp(b, a)))
        TNL::swap(a, b);
}
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
//---------------------------------------------
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
/**
 * this kernel simulates 1 exchange 
 */
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
template <typename Value, typename Function>
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
__global__ void bitonicMergeGlobal(ArrayView<Value, Device> arr,
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                                 int begin, int end, const Function & Cmp,
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                                 int monotonicSeqLen, int len, int partsInSeq)
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    int part = i / (len / 2); //computes which sorting block this thread belongs to
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    //the index of 2 elements that should be compared and swapped
    int s = begin + part * len + (i & ((len / 2) - 1) );
    int e = s + len / 2;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    if (e >= end) //arr[e] is virtual padding and will not be exchanged with
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed

    //calculate the direction of swapping
    int monotonicSeqIdx = part / partsInSeq;
    bool ascending = (monotonicSeqIdx & 1) != 0;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for part with no "partner" to be merged with in next phase
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
        ascending = true;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    cmpSwap(arr[s], arr[e], ascending, Cmp);
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
//---------------------------------------------
/**
 * kernel for merging if whole block fits into shared memory
 * will merge all the way down til stride == 2
 * */
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
template <typename Value, typename Function>
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
__global__ void bitonicMergeSharedMemory(ArrayView<Value, Device> arr,
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                                         int begin, int end, const Function & Cmp,
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                                         int monotonicSeqLen, int len, int partsInSeq)
    extern __shared__ int externMem[];
    Value * sharedMem = (Value *)externMem;

    int sharedMemLen = 2*blockDim.x;

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    //1st index and last index of subarray that this threadBlock should merge
    int myBlockStart = begin + blockIdx.x * sharedMemLen;
    int myBlockEnd = end < myBlockStart+sharedMemLen? end : myBlockStart+sharedMemLen;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    //copy from globalMem into sharedMem
    int copy1 = myBlockStart + threadIdx.x;
    int copy2 = copy1 + blockDim.x;
        if(copy1 < end)
            sharedMem[threadIdx.x] = arr[copy1];
        if(copy2 < end)
            sharedMem[threadIdx.x + blockDim.x] = arr[copy2];
        __syncthreads();
    }
    //------------------------------------------
    //bitonic activity
        //calculate the direction of swapping
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int part = i / (len / 2);
        int monotonicSeqIdx = part / partsInSeq;

        bool ascending = (monotonicSeqIdx & 1) != 0;
        //special case for parts with no "partner"
        if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end)
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
            ascending = true;
        //------------------------------------------

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
        //do bitonic merge
        for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
        {
            __syncthreads();
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
            //calculates which 2 indexes will be compared and swap
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
            int part = threadIdx.x / (len / 2);
            int s = part * len + (threadIdx.x & ((len /2) - 1));
            int e = s + len / 2;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
            if(e >= myBlockEnd - myBlockStart) //touching virtual padding
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
            cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp);

        __syncthreads();

    }

    //------------------------------------------
    //writeback to global memory
        if(copy1 < end)
            arr[copy1] = sharedMem[threadIdx.x];
        if(copy2 < end)
            arr[copy2] = sharedMem[threadIdx.x + blockDim.x];
//---------------------------------------------
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
/**
 * very similar to bitonicMergeSharedMemory
 * does bitonicMergeSharedMemory but afterwards increases monotoncSeqLen
 *  then trickles down again
 * this continues until whole sharedMem is sorted
 * */
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
template <typename Value, typename Function>
__global__ void bitoniSort1stStepSharedMemory(ArrayView<Value, Device> arr, int begin, int end, const Function & Cmp)
    extern __shared__ int externMem[];
    
    Value * sharedMem = (Value *)externMem;
    int sharedMemLen = 2*blockDim.x;
    int myBlockStart = begin + blockIdx.x * sharedMemLen;
    int myBlockEnd = end < myBlockStart+sharedMemLen? end : myBlockStart+sharedMemLen;

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    //copy from globalMem into sharedMem
    int copy1 = myBlockStart + threadIdx.x;
    int copy2 = copy1 + blockDim.x;
        if(copy1 < end)
            sharedMem[threadIdx.x] = arr[copy1];

        if(copy2 < end)
            sharedMem[threadIdx.x + blockDim.x] = arr[copy2];

        __syncthreads();
    }
    //------------------------------------------
    //bitonic activity
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        int paddedSize = closestPow2(myBlockEnd - myBlockStart);
        for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
        {
            //calculate the direction of swapping
            int monotonicSeqIdx = i / (monotonicSeqLen/2);
            bool ascending = (monotonicSeqIdx & 1) != 0;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
            if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end) //special case for parts with no "partner"
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                ascending = true;

            for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
            {
                __syncthreads();

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                //calculates which 2 indexes will be compared and swap
                int part = threadIdx.x / (len / 2);
                int s = part * len + (threadIdx.x & ((len / 2) - 1));
                int e = s + len / 2;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                if(e >= myBlockEnd - myBlockStart) //touching virtual padding
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp);

            }
        }

        __syncthreads();

    }

    //------------------------------------------
    //writeback to global memory
    {
        if(copy1 < end)
            arr[copy1] = sharedMem[threadIdx.x];
        if(copy2 < end)
            arr[copy2] = sharedMem[threadIdx.x + blockDim.x];
//---------------------------------------------
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
template <typename Value, typename Function>
void bitonicSort(ArrayView<Value, Device> arr, int begin, int end, const Function& Cmp)
{
    int arrSize = end - begin;
    int paddedSize = closestPow2(arrSize);
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed

    int threadsNeeded = arrSize / 2 + (arrSize %2 !=0);
    const int maxThreadsPerBlock = 512;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    int threadPerBlock = min(maxThreadsPerBlock, threadsNeeded);
    int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock == 0 ? 0 : 1);

    const int sharedMemLen = threadPerBlock * 2;
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    const int sharedMemSize = sharedMemLen* sizeof(Value);
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed

    //---------------------------------------------------------------------------------

    
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    bitoniSort1stStepSharedMemory<<<blocks, threadPerBlock, sharedMemSize>>>(arr, begin, end, Cmp);
    for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    {
        for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
        {
            if(len > sharedMemLen)
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                bitonicMergeGlobal<<<blocks, threadPerBlock>>>(arr, begin, end, Cmp,
                                                            monotonicSeqLen, len, partsInSeq);
            }
            else
            {

Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
                bitonicMergeSharedMemory<<<blocks, threadPerBlock, sharedMemSize>>>(arr, begin, end, Cmp,
                                                                                    monotonicSeqLen, len, partsInSeq);
                break;
            }
    cudaDeviceSynchronize();
//---------------------------------------------
template <typename Value, typename Function>
void bitonicSort(ArrayView<Value, Device> arr, const Function & cmp)
{
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
    bitonicSort(arr, 0, arr.getSize(), cmp);
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
template <typename Value>
void bitonicSort(ArrayView<Value, Device> arr)
Xuan Thang Nguyen's avatar
Xuan Thang Nguyen committed
{
    bitonicSort(arr, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;});
}