Loading src/bitonicSort/bitonicSort.h +124 −104 Original line number Diff line number Diff line Loading @@ -18,12 +18,15 @@ __host__ __device__ int closestPow2(int x) template <typename Value, typename Function> __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp) { if( (ascending == Cmp(b, a))) if (ascending == Cmp(b, a)) TNL::swap(a, b); } //--------------------------------------------- /** * this kernel simulates 1 exchange * splits input arr that is bitonic into 2 bitonic sequences */ template <typename Value, typename Function> __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, Loading @@ -48,15 +51,17 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Device cmpSwap(arr[s], arr[e], ascending, Cmp); } //--------------------------------------------- //--------------------------------------------- /** * kernel for merging if whole block fits into shared memory * will merge all the way down til stride == 2 * simulates many layers of merge * turns input that is a bitonic sequence into 1 monotonic sequence * * this version uses shared memory to do the operations * */ template <typename Value, typename Function> __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, int monotonicSeqLen, int len, int partsInSeq) { Loading Loading @@ -120,10 +125,14 @@ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cu } } /** * simulates many layers of merge * turns input that is a bitonic sequence into 1 monotonic sequence * * this user only operates on global memory, no shared memory is used * */ template <typename Value, typename Function> __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, int monotonicSeqLen, int len, int partsInSeq) { Loading Loading @@ -160,9 +169,15 @@ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, //--------------------------------------------- /** * IMPORTANT: all threads in block have to call this function to work properly * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted * Description: sorts src and writes into dst within a block * works independently from other concurrent blocks * @param sharedMem sharedMem pointer has to be able to store blockDim.x*2 elements * */ template <typename Value, typename Function> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, Value *sharedMem, const Function &Cmp) { Loading Loading @@ -217,9 +232,16 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src } } /** * IMPORTANT: all threads in block have to call this function to work properly * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted * Description: sorts src and writes into dst within a block * works independently from other concurrent blocks * this version doesnt use shared memory and is prefered for Value with big size * */ template <typename Value, typename Function> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, const Function &Cmp) { Loading Loading @@ -249,10 +271,9 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src } /** * very similar to bitonicMergeSharedMemory * does bitonicMergeSharedMemory but afterwards increases monotoncSeqLen * then trickles down again * this continues until whole sharedMem is sorted * entrypoint for bitonicSort_Block * sorts @param arr in alternating order to create bitonic sequences * sharedMem has to be able to store at least blockDim.x*2 elements * */ template <typename Value, typename Function> __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) Loading @@ -266,10 +287,14 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, Cmp); else bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} ); [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); } /** * entrypoint for bitonicSort_Block * sorts @param arr in alternating order to create bitonic sequences * doesn't use shared memory * */ template <typename Value, typename Function> __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) { Loading @@ -280,8 +305,7 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), Cmp); else bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} ); [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); } //--------------------------------------------- Loading Loading @@ -418,8 +442,6 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch, Swap(s, e); } template <typename FETCH, typename CMP, typename SWAP> void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) { Loading @@ -433,14 +455,12 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP& Cmp, SWAP Swap) int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); auto fetchWithOffset = [=] __cuda_callable__(int i) { [=] __cuda_callable__(int i) { return Fetch(i + begin); }; auto swapWithOffset = [=] __cuda_callable__(int i, int j) mutable { [=] __cuda_callable__(int i, int j) mutable { Swap(i + begin, j + begin); }; Loading Loading
src/bitonicSort/bitonicSort.h +124 −104 Original line number Diff line number Diff line Loading @@ -18,12 +18,15 @@ __host__ __device__ int closestPow2(int x) template <typename Value, typename Function> __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp) { if( (ascending == Cmp(b, a))) if (ascending == Cmp(b, a)) TNL::swap(a, b); } //--------------------------------------------- /** * this kernel simulates 1 exchange * splits input arr that is bitonic into 2 bitonic sequences */ template <typename Value, typename Function> __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, Loading @@ -48,15 +51,17 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Device cmpSwap(arr[s], arr[e], ascending, Cmp); } //--------------------------------------------- //--------------------------------------------- /** * kernel for merging if whole block fits into shared memory * will merge all the way down til stride == 2 * simulates many layers of merge * turns input that is a bitonic sequence into 1 monotonic sequence * * this version uses shared memory to do the operations * */ template <typename Value, typename Function> __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, int monotonicSeqLen, int len, int partsInSeq) { Loading Loading @@ -120,10 +125,14 @@ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cu } } /** * simulates many layers of merge * turns input that is a bitonic sequence into 1 monotonic sequence * * this user only operates on global memory, no shared memory is used * */ template <typename Value, typename Function> __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, int monotonicSeqLen, int len, int partsInSeq) { Loading Loading @@ -160,9 +169,15 @@ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, //--------------------------------------------- /** * IMPORTANT: all threads in block have to call this function to work properly * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted * Description: sorts src and writes into dst within a block * works independently from other concurrent blocks * @param sharedMem sharedMem pointer has to be able to store blockDim.x*2 elements * */ template <typename Value, typename Function> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, Value *sharedMem, const Function &Cmp) { Loading Loading @@ -217,9 +232,16 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src } } /** * IMPORTANT: all threads in block have to call this function to work properly * IMPORTANT: input can be at max size of blockDim.x*2, bigger size will lead to part of input unsorted * Description: sorts src and writes into dst within a block * works independently from other concurrent blocks * this version doesnt use shared memory and is prefered for Value with big size * */ template <typename Value, typename Function> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, const Function &Cmp) { Loading Loading @@ -249,10 +271,9 @@ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src } /** * very similar to bitonicMergeSharedMemory * does bitonicMergeSharedMemory but afterwards increases monotoncSeqLen * then trickles down again * this continues until whole sharedMem is sorted * entrypoint for bitonicSort_Block * sorts @param arr in alternating order to create bitonic sequences * sharedMem has to be able to store at least blockDim.x*2 elements * */ template <typename Value, typename Function> __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) Loading @@ -266,10 +287,14 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, Cmp); else bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value *)externMem, [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} ); [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); } /** * entrypoint for bitonicSort_Block * sorts @param arr in alternating order to create bitonic sequences * doesn't use shared memory * */ template <typename Value, typename Function> __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) { Loading @@ -280,8 +305,7 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), Cmp); else bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);} ); [&] __cuda_callable__(const Value &a, const Value &b) { return Cmp(b, a); }); } //--------------------------------------------- Loading Loading @@ -418,8 +442,6 @@ __global__ void bitonicMergeGlobal(int size, FETCH Fetch, Swap(s, e); } template <typename FETCH, typename CMP, typename SWAP> void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) { Loading @@ -433,14 +455,12 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP& Cmp, SWAP Swap) int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); auto fetchWithOffset = [=] __cuda_callable__(int i) { [=] __cuda_callable__(int i) { return Fetch(i + begin); }; auto swapWithOffset = [=] __cuda_callable__(int i, int j) mutable { [=] __cuda_callable__(int i, int j) mutable { Swap(i + begin, j + begin); }; Loading