Commit 70e47068 authored by Tomáš Oberhuber
Browse files

Removing Fetch lambda function from inplace bitonic sort.

parent 95b5e99d
Loading
Loading
Loading
Loading
+9 −9
Original line number Diff line number Diff line
@@ -315,8 +315,8 @@ void bitonicSort( TNL::Containers::Array< Value, TNL::Devices::Host > &vec)
//---------------------------------------------

#ifdef HAVE_CUDA
template <typename FETCH, typename CMP, typename SWAP>
__global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap,
template< typename CMP, typename SWAP>
__global__ void bitonicMergeGlobal(int size, CMP Cmp, SWAP Swap,
                                   int monotonicSeqLen, int bitonicLen)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
@@ -336,12 +336,12 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap,
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase
        ascending = true;

    if (ascending == Cmp(Fetch(e), Fetch(s)))
    if( ascending == Cmp(e, s) )
        Swap(s, e);
}

template <typename FETCH, typename CMP, typename SWAP>
void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
template< typename CMP, typename SWAP >
void bitonicSort(int begin, int end, const CMP &Cmp, SWAP Swap)
{
    int size = end - begin;
    int paddedSize = closestPow2(size);
@@ -352,9 +352,9 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
    int threadsPerBlock = maxThreadsPerBlock;
    int blocks = threadsNeeded / threadsPerBlock + (threadsNeeded % threadsPerBlock != 0);

    auto fetchWithOffset =
        [=] __cuda_callable__(int i) {
            return Fetch(i + begin);
    auto compareWithOffset =
        [=] __cuda_callable__(int i, int j) {
            return Cmp(i + begin, j + begin);
        };

    auto swapWithOffset =
@@ -367,7 +367,7 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
        for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2)
        {
            bitonicMergeGlobal<<<blocks, threadsPerBlock>>>(
                size, fetchWithOffset, Cmp, swapWithOffset,
                size, compareWithOffset, swapWithOffset,
                monotonicSeqLen, bitonicLen);
        }
    }
+7 −7
Original line number Diff line number Diff line
@@ -314,10 +314,10 @@ TEST(sortRange, middleMultiBlock)
/**
 * Test helper: runs bitonicSort over the whole CUDA array view using
 * index-based compare and swap lambdas.
 *
 * The comparator receives two element positions and reads the elements from
 * the captured view itself, so no separate fetch lambda is passed to
 * bitonicSort.
 *
 * \tparam TYPE element type stored in the view; elements are compared with operator<.
 * \param view  array view on the CUDA device; captured by value in both lambdas.
 */
template<typename TYPE>
void fetchAndSwapSorter(TNL::Containers::ArrayView<TYPE, TNL::Devices::Cuda> view)
{
    // Compares the elements stored at positions i and j (the arguments are indices, not values).
    auto Cmp = [=] __cuda_callable__ (const int i, const int j) { return view[i] < view[j]; };
    // mutable: the lambda writes elements through its by-value copy of the view.
    auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); };
    // Range is [0, view.getSize()), i.e. the complete view.
    bitonicSort(0, view.getSize(), Cmp, Swap);
}

TEST(fetchAndSwap, oneBlockSort)
@@ -360,10 +360,10 @@ TEST(fetchAndSwap, typeDouble)

/**
 * Test helper: runs bitonicSort on the sub-range given by `from` and `to`
 * of an int CUDA array view, using index-based compare and swap lambdas.
 *
 * The comparator receives two element positions and reads the elements from
 * the captured view itself, so no separate fetch lambda is passed to
 * bitonicSort.
 *
 * \param view  array view on the CUDA device; captured by value in both lambdas.
 * \param from  passed to bitonicSort as the `begin` index of the range.
 * \param to    passed to bitonicSort as the `end` index of the range.
 */
void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view, int from, int to)
{
    // Compares the elements stored at positions i and j (the arguments are indices, not values).
    auto Cmp = [=] __cuda_callable__ (const int i, const int j) { return view[i] < view[j]; };
    // mutable: the lambda writes elements through its by-value copy of the view.
    auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); };
    bitonicSort(from, to, Cmp, Swap);
}

TEST(fetchAndSwap, sortMiddle)