Commit d4e46a7e authored by Xuan Thang Nguyen's avatar Xuan Thang Nguyen
Browse files

bitonic sorter in-place with fetch and swap; tests still need to be implemented

parent a2f401f2
Loading
Loading
Loading
Loading
+64 −1
Original line number Diff line number Diff line
@@ -293,3 +293,66 @@ void bitonicSort(std::vector<Value> & vec)
}

//---------------------------------------------
//---------------------------------------------

/**
 * One global bitonic-merge step over the whole (virtually padded) array.
 *
 * Expected launch: 1D grid, one thread per compare-exchange pair, i.e. at
 * least ceil(size / 2) threads in total; `len` and `monotonicSeqLen` must be
 * powers of two.
 *
 * FIX: the functors were originally taken by const reference. A __global__
 * kernel parameter that is a reference copies only a pointer to the host-side
 * object onto the device, so dereferencing it in device code reads host
 * memory — an illegal access. Closure objects must be passed BY VALUE so the
 * whole captured state is copied into kernel parameter space.
 *
 * @param Fetch  callable int -> value; reads the element at a global index
 * @param Cmp    callable (a, b) -> bool; the "ordered" predicate
 * @param Swap   callable (i, j); exchanges the elements at two global indices
 * @param monotonicSeqLen length of the bitonic sequences built so far
 * @param len    distance*2 between the two compared elements in this step
 * @param partsInSeq number of `len`-sized parts per monotonic sequence
 */
template <typename FETCH, typename CMP,  typename SWAP>
__global__ void bitonicMergeGlobal(int size, FETCH Fetch,
                                 CMP Cmp, SWAP Swap,
                                 int monotonicSeqLen, int len, int partsInSeq)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    int part = i / (len / 2); //computes which sorting block this thread belongs to

    //the index of 2 elements that should be compared and swapped
    int s = part * len + (i & ((len / 2) - 1) );
    int e = s + len / 2;
    if (e >= size) //arr[e] is virtual padding and will not be exchanged with
        return;

    //calculate the direction of swapping
    int monotonicSeqIdx = part / partsInSeq;
    bool ascending = (monotonicSeqIdx & 1) != 0;
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase
        ascending = true;

    //exchange the pair when its order disagrees with the required direction
    if (ascending == Cmp(Fetch(e), Fetch(s)))
        Swap(s, e);
}



/**
 * In-place bitonic sort of the index range [begin, end) expressed through
 * user-supplied accessors, so any indexable structure can be sorted.
 *
 * @param begin  first index of the range to sort (inclusive)
 * @param end    one past the last index to sort (exclusive)
 * @param Fetch  device callable int -> value; reads element at an index
 * @param Cmp    device callable (a, b) -> bool; strict "less" style predicate
 * @param Swap   device callable (i, j); exchanges elements at two indices
 *
 * FIX: swapWithOffset originally forwarded (i+begin, i+begin) — it swapped an
 * element with itself, so the sort never reordered anything. It now forwards
 * both indices, (i+begin, j+begin).
 */
template <typename FETCH, typename CMP,  typename SWAP>
void bitonicSort(int begin, int end, const FETCH & Fetch, const CMP& Cmp, const SWAP & Swap)
{
    int size = end - begin;
    //the algorithm logically pads the range up to the next power of two;
    //padding elements are virtual and never touched (guard in the kernel)
    int paddedSize = closestPow2(size);

    //one thread services one compare-exchange pair -> ceil(size / 2) threads
    int threadsNeeded = size / 2 + (size %2 !=0);

    const int maxThreadsPerBlock = 512;
    int threadPerBlock = maxThreadsPerBlock;
    int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); //ceil-div

    //translate the kernel's 0-based indices into the caller's [begin, end) range
    auto fetchWithOffset =
        [=] __cuda_callable__(int i)
        {
            return Fetch(i + begin);
        };

    auto swapWithOffset =
        [=] __cuda_callable__(int i, int j)
        {
            //forward BOTH indices (the original passed i twice — a no-op swap)
            return Swap(i + begin, j + begin);
        };

    //standard bitonic schedule: grow monotonic sequences, then merge each
    //with strides len, len/2, ..., 2
    for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
    {
        for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
        {
            bitonicMergeGlobal<<<blocks, threadPerBlock>>>(
                size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq);
        }
    }
    //grid-wide ordering between steps is provided by consecutive launches on
    //the same stream; this final sync makes the result visible to the host
    cudaDeviceSynchronize();
}
 No newline at end of file
+32 −0
Original line number Diff line number Diff line
@@ -239,7 +239,39 @@ TEST(sortRange, middleMultiBlock)
    ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1);
    ASSERT_TRUE(arr.back() == -1); 
}
/*
void fetchAndSwapSorter(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view)
{
    
    //auto Fetch = [=]__cuda_callable__(int i){return view[i];};
    //auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;};
    //auto Swap = [=] __device__ (int i, int j){TNL::swap(view[i], view[j]);};
    //bitonicSort(0, view.getSize(), Fetch, Cmp, Swap);
    
}

TEST(fetchAndSwap, oneBlockSort)
{
    int size = 9;
    const int stride = 227;
    int i = 0;

    std::vector<int> orig(size);
    std::iota(orig.begin(), orig.end(), 0);

    do
    {
        if ((i++) % stride != 0)
            continue;

        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
        auto view = cudaArr.getView();
        fetchAndSwapSorter(view);
        ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
    }
    while (std::next_permutation(orig.begin(), orig.end()));
}
*/

//----------------------------------------------------------------------------------