reformating and adding function description (08769c95) · Commits · TNL / GPUSort

src/bitonicSort/bitonicSort.h

+124 −104

Original line number	Diff line number	Diff line
		@@ -18,12 +18,15 @@ __host__ __device__ int closestPow2(int x)
		template <typename Value, typename Function>
		__host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp)
		{
		if( (ascending == Cmp(b, a)))
		if (ascending == Cmp(b, a))
		TNL::swap(a, b);
		}

		//---------------------------------------------

		/**
		* this kernel simulates 1 exchange
		* splits input arr that is bitonic into 2 bitonic sequences
		*/
		template <typename Value, typename Function>
		__global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
		@@ -48,15 +51,17 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Device

		cmpSwap(arr[s], arr[e], ascending, Cmp);
		}

		//---------------------------------------------
		//---------------------------------------------

		/**
		* kernel for merging if whole block fits into shared memory
		* will merge all the way down til stride == 2
		* simulates many layers of merge
		* turns input that is a bitonic sequence into 1 monotonic sequence
		*
		* this version uses shared memory to do the operations
		* */
		template <typename Value, typename Function>
		__global__
		void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
		__global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
		const Function &Cmp,
		int monotonicSeqLen, int len, int partsInSeq)
		{
		@@ -120,10 +125,14 @@ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cu
		}
		}


		/**
		* simulates many layers of merge
		* turns input that is a bitonic sequence into 1 monotonic sequence
		*
		* this user only operates on global memory, no shared memory is used
		* */
		template <typename Value, typename Function>
		__global__
		void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
		__global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
		const Function &Cmp,
		int monotonicSeqLen, int len, int partsInSeq)
		{
		@@ -160,9 +169,15 @@ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,

		//---------------------------------------------

		/**
		* IMPORTANT: all threads in block have to call this function to work properly
		* IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted
		* Description: sorts src and writes into dst within a block
		* works independently from other concurrent blocks
		* @param sharedMem sharedMem pointer has to be able to store blockDim.x*2 elements
		* */
		template <typename Value, typename Function>
		__device__
		void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
		__device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
		TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
		Value *sharedMem, const Function &Cmp)
		{
		@@ -217,9 +232,16 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src
		}
		}


		/**
		* IMPORTANT: all threads in block have to call this function to work properly
		* IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted
		* Description: sorts src and writes into dst within a block
		* works independently from other concurrent blocks
		* this version doesnt use shared memory and is prefered for Value with big size
		* */
		template <typename Value, typename Function>
		__device__
		void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
		__device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
		TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
		const Function &Cmp)
		{
		@@ -249,10 +271,9 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src
		}

		/**
		* very similar to bitonicMergeSharedMemory
		* does bitonicMergeSharedMemory but afterwards increases monotoncSeqLen
		* then trickles down again
		* this continues until whole sharedMem is sorted
		* entrypoint for bitonicSort_Block
		* sorts @param arr in alternating order to create bitonic sequences
		* sharedMem has to be able to store at least blockDim.x*2 elements
		* */
		template <typename Value, typename Function>
		__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp)
		@@ -266,10 +287,14 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value,
		bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, Cmp);
		else
		bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem,
		[&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);}
		);
		[&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); });
		}

		/**
		* entrypoint for bitonicSort_Block
		* sorts @param arr in alternating order to create bitonic sequences
		* doesn't use shared memory
		* */
		template <typename Value, typename Function>
		__global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp)
		{
		@@ -280,8 +305,7 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices
		bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), Cmp);
		else
		bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd),
		[&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);}
		);
		[&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); });
		}

		//---------------------------------------------
		@@ -418,8 +442,6 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch,
		Swap(s, e);
		}



		template <typename FETCH, typename CMP, typename SWAP>
		void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap)
		{
		@@ -433,14 +455,12 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP& Cmp, SWAP Swap)
		int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0);

		auto fetchWithOffset =
		[=] __cuda_callable__(int i)
		{
		[=] __cuda_callable__(int i) {
		return Fetch(i + begin);
		};

		auto swapWithOffset =
		[=] __cuda_callable__(int i, int j) mutable
		{
		[=] __cuda_callable__(int i, int j) mutable {
		Swap(i + begin, j + begin);
		};