Removing Fetch lambda function from inplace bitonic sort. (70e47068) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Sorting/detail/bitonicSort.h

+9 −9

Original line number	Diff line number	Diff line
		@@ -315,8 +315,8 @@ void bitonicSort( TNL::Containers::Array< Value, TNL::Devices::Host > &vec)
		//---------------------------------------------

		#ifdef HAVE_CUDA
		-template <typename FETCH, typename CMP, typename SWAP>
		-__global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap,
		+template< typename CMP, typename SWAP>
		+__global__ void bitonicMergeGlobal(int size, CMP Cmp, SWAP Swap,
		int monotonicSeqLen, int bitonicLen)
		{
		int i = blockIdx.x * blockDim.x + threadIdx.x;
		@@ -336,12 +336,12 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap,
		if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase
		ascending = true;

		-if (ascending == Cmp(Fetch(e), Fetch(s)))
		+if( ascending == Cmp(e, s) )
		Swap(s, e);
		}

		-template <typename FETCH, typename CMP, typename SWAP>
		-void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
		+template< typename CMP, typename SWAP >
		+void bitonicSort(int begin, int end, const CMP &Cmp, SWAP Swap)
		{
		int size = end - begin;
		int paddedSize = closestPow2(size);
		@@ -352,9 +352,9 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
		int threadsPerBlock = maxThreadsPerBlock;
		int blocks = threadsNeeded / threadsPerBlock + (threadsNeeded % threadsPerBlock != 0);

		-auto fetchWithOffset =
		-[=] __cuda_callable__(int i) {
		-return Fetch(i + begin);
		+auto compareWithOffset =
		+[=] __cuda_callable__(int i, int j) {
		+return Cmp(i + begin, j + begin);
		};

		auto swapWithOffset =
		@@ -367,7 +367,7 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
		for (int bitonicLen = monotonicSeqLen; bitonicLen > 1; bitonicLen /= 2)
		{
		bitonicMergeGlobal<<<blocks, threadsPerBlock>>>(
		-size, fetchWithOffset, Cmp, swapWithOffset,
		+size, compareWithOffset, swapWithOffset,
		monotonicSeqLen, bitonicLen);
		}
		}

src/UnitTests/Algorithms/Sorting/BitonicSortTest.h

+7 −7

Original line number	Diff line number	Diff line
		@@ -314,10 +314,10 @@ TEST(sortRange, middleMultiBlock)
		template<typename TYPE>
		void fetchAndSwapSorter(TNL::Containers::ArrayView<TYPE, TNL::Devices::Cuda> view)
		{
		-auto Fetch = [=]__cuda_callable__(int i){return view[i];};
		-auto Cmp = [=]__cuda_callable__(const TYPE & a, const TYPE & b){return a < b;};
		+//auto Fetch = [=]__cuda_callable__(int i){return view[i];};
		+auto Cmp = [=]__cuda_callable__(const int i, const int j ){return view[ i ] < view[ j ];};
		auto Swap = [=] __cuda_callable__ (int i, int j) mutable {TNL::swap(view[i], view[j]);};
		-bitonicSort(0, view.getSize(), Fetch, Cmp, Swap);
		+bitonicSort(0, view.getSize(), Cmp, Swap);
		}

		TEST(fetchAndSwap, oneBlockSort)
		@@ -360,10 +360,10 @@ TEST(fetchAndSwap, typeDouble)

		void fetchAndSwap_sortMiddle(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view, int from, int to)
		{
		-auto Fetch = [=]__cuda_callable__(int i){return view[i];};
		-auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;};
		+//auto Fetch = [=]__cuda_callable__(int i){return view[i];};
		+auto Cmp = [=]__cuda_callable__(const int i, const int j ){ return view[ i ] < view[ j ]; };
		auto Swap = [=] __cuda_callable__ (int i, int j) mutable { TNL::swap(view[i], view[j]); };
		-bitonicSort(from, to, Fetch, Cmp, Swap);
		+bitonicSort(from, to, Cmp, Swap);
		}

		TEST(fetchAndSwap, sortMiddle)