// ===== src/bitonicSort/bitonicSort.h (chunk appended after the std::vector overload) =====
//---------------------------------------------
//---------------------------------------------

/**
 * One global-memory compare-and-swap step of a bitonic merge over an
 * abstract sequence described by callables instead of a raw array.
 *
 * Grid layout: 1-D grid of 1-D blocks; thread i handles one element pair.
 * The caller launches ceil(size/2) threads in total (see bitonicSort below).
 *
 * @param size            number of real (non-padding) elements
 * @param Fetch           __cuda_callable__ (int) -> value at index
 * @param Cmp             __cuda_callable__ (a, b) -> true when a precedes b
 * @param Swap            __cuda_callable__ (i, j) exchanges elements i and j
 * @param monotonicSeqLen current length of each monotonic run being built
 * @param len             distance*2 between the two compared elements
 * @param partsInSeq      number of len-sized parts per monotonic sequence
 */
template <typename FETCH, typename CMP, typename SWAP>
__global__ void bitonicMergeGlobal(int size,
                                   const FETCH & Fetch, const CMP & Cmp, const SWAP & Swap,
                                   int monotonicSeqLen, int len, int partsInSeq)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int part = i / (len / 2); // computes which sorting block this thread belongs to

    // the indices of the 2 elements that should be compared and swapped
    // (len is a power of two, so the & replaces a modulo by len/2)
    int s = part * len + (i & ((len / 2) - 1));
    int e = s + len / 2;
    if (e >= size) // arr[e] is virtual padding and will not be exchanged with
        return;

    // calculate the direction of swapping: monotonic sequences alternate
    // ascending/descending by index so adjacent pairs form bitonic runs
    int monotonicSeqIdx = part / partsInSeq;
    bool ascending = (monotonicSeqIdx & 1) != 0;
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) // special case for part with no "partner" to be merged with in next phase
        ascending = true;

    if (ascending == Cmp(Fetch(e), Fetch(s)))
        Swap(s, e);
}

/**
 * Sorts the index range [begin, end) of an abstract sequence given by
 * Fetch/Cmp/Swap callables, using global-memory bitonic merge steps only.
 * Non-power-of-two sizes are handled by virtual padding inside the kernel.
 * Blocks on cudaDeviceSynchronize() before returning.
 */
template <typename FETCH, typename CMP, typename SWAP>
void bitonicSort(int begin, int end, const FETCH & Fetch, const CMP & Cmp, const SWAP & Swap)
{
    int size = end - begin;
    int paddedSize = closestPow2(size);

    int threadsNeeded = size / 2 + (size % 2 != 0); // ceil(size / 2), one thread per pair

    const int maxThreadsPerBlock = 512;
    int threadPerBlock = maxThreadsPerBlock;
    int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); // ceil-div

    auto fetchWithOffset = [=] __cuda_callable__(int i) { return Fetch(i + begin); };
    // FIX: the original lambda called Swap(i+begin, i+begin) — a self-swap that
    // ignored j and made every exchange a no-op, so nothing was ever sorted.
    auto swapWithOffset = [=] __cuda_callable__(int i, int j) { return Swap(i + begin, j + begin); };

    for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
    {
        for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
        {
            bitonicMergeGlobal<<<blocks, threadPerBlock>>>(
                size, fetchWithOffset, Cmp, swapWithOffset,
                monotonicSeqLen, len, partsInSeq);
        }
    }
    cudaDeviceSynchronize();
}
// (no newline at end of file in the original)

// ===== tests/bitonic_tests/unitTests.cu =====
// Tail of TEST(sortRange, middleMultiBlock); the body begins outside this chunk.
    ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1);
    ASSERT_TRUE(arr.back() == -1);
}

/*
// Disabled: exercises the Fetch/Cmp/Swap bitonicSort overload; the sorter body
// is also commented out, so the test would fail until it is re-enabled.
void fetchAndSwapSorter(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view)
{
    //auto Fetch = [=]__cuda_callable__(int i){return view[i];};
    //auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;};
    //auto Swap = [=] __device__ (int i, int j){TNL::swap(view[i], view[j]);};
    //bitonicSort(0, view.getSize(), Fetch, Cmp, Swap);
}

TEST(fetchAndSwap, oneBlockSort)
{
    int size = 9;
    const int stride = 227; // sample every stride-th permutation to keep runtime sane
    int i = 0;
    std::vector<int> orig(size);
    std::iota(orig.begin(), orig.end(), 0);
    do
    {
        if ((i++) % stride != 0)
            continue;

        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
        auto view = cudaArr.getView();
        fetchAndSwapSorter(view);
        ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
    } while (std::next_permutation(orig.begin(), orig.end()));
}
*/
//----------------------------------------------------------------------------------
// ===== src/bitonicSort/bitonicSort.h (chunk appended after the std::vector overload) =====
//---------------------------------------------
//---------------------------------------------

/**
 * One global-memory compare-and-swap step of a bitonic merge over an
 * abstract sequence described by callables instead of a raw array.
 *
 * Grid layout: 1-D grid of 1-D blocks; thread i handles one element pair.
 * The caller launches ceil(size/2) threads in total (see bitonicSort below).
 *
 * @param size            number of real (non-padding) elements
 * @param Fetch           __cuda_callable__ (int) -> value at index
 * @param Cmp             __cuda_callable__ (a, b) -> true when a precedes b
 * @param Swap            __cuda_callable__ (i, j) exchanges elements i and j
 * @param monotonicSeqLen current length of each monotonic run being built
 * @param len             distance*2 between the two compared elements
 * @param partsInSeq      number of len-sized parts per monotonic sequence
 */
template <typename FETCH, typename CMP, typename SWAP>
__global__ void bitonicMergeGlobal(int size,
                                   const FETCH & Fetch, const CMP & Cmp, const SWAP & Swap,
                                   int monotonicSeqLen, int len, int partsInSeq)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int part = i / (len / 2); // computes which sorting block this thread belongs to

    // the indices of the 2 elements that should be compared and swapped
    // (len is a power of two, so the & replaces a modulo by len/2)
    int s = part * len + (i & ((len / 2) - 1));
    int e = s + len / 2;
    if (e >= size) // arr[e] is virtual padding and will not be exchanged with
        return;

    // calculate the direction of swapping: monotonic sequences alternate
    // ascending/descending by index so adjacent pairs form bitonic runs
    int monotonicSeqIdx = part / partsInSeq;
    bool ascending = (monotonicSeqIdx & 1) != 0;
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) // special case for part with no "partner" to be merged with in next phase
        ascending = true;

    if (ascending == Cmp(Fetch(e), Fetch(s)))
        Swap(s, e);
}

/**
 * Sorts the index range [begin, end) of an abstract sequence given by
 * Fetch/Cmp/Swap callables, using global-memory bitonic merge steps only.
 * Non-power-of-two sizes are handled by virtual padding inside the kernel.
 * Blocks on cudaDeviceSynchronize() before returning.
 */
template <typename FETCH, typename CMP, typename SWAP>
void bitonicSort(int begin, int end, const FETCH & Fetch, const CMP & Cmp, const SWAP & Swap)
{
    int size = end - begin;
    int paddedSize = closestPow2(size);

    int threadsNeeded = size / 2 + (size % 2 != 0); // ceil(size / 2), one thread per pair

    const int maxThreadsPerBlock = 512;
    int threadPerBlock = maxThreadsPerBlock;
    int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0); // ceil-div

    auto fetchWithOffset = [=] __cuda_callable__(int i) { return Fetch(i + begin); };
    // FIX: the original lambda called Swap(i+begin, i+begin) — a self-swap that
    // ignored j and made every exchange a no-op, so nothing was ever sorted.
    auto swapWithOffset = [=] __cuda_callable__(int i, int j) { return Swap(i + begin, j + begin); };

    for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
    {
        for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
        {
            bitonicMergeGlobal<<<blocks, threadPerBlock>>>(
                size, fetchWithOffset, Cmp, swapWithOffset,
                monotonicSeqLen, len, partsInSeq);
        }
    }
    cudaDeviceSynchronize();
}
// (no newline at end of file in the original)
// ===== tests/bitonic_tests/unitTests.cu =====
// Tail of TEST(sortRange, middleMultiBlock); the body begins outside this chunk.
    ASSERT_TRUE(arr[e + (std::rand() % (size - e))] == -1);
    ASSERT_TRUE(arr.back() == -1);
}

/*
// Disabled test for the Fetch/Cmp/Swap bitonicSort overload — the sorter body
// below is itself commented out, so enabling the TEST alone would fail.
// NOTE(review): re-enable once the range overload is confirmed working.
void fetchAndSwapSorter(TNL::Containers::ArrayView<int, TNL::Devices::Cuda> view)
{
    //auto Fetch = [=]__cuda_callable__(int i){return view[i];};
    //auto Cmp = [=]__cuda_callable__(const int & a, const int & b){return a < b;};
    //auto Swap = [=] __device__ (int i, int j){TNL::swap(view[i], view[j]);};
    //bitonicSort(0, view.getSize(), Fetch, Cmp, Swap);
}

TEST(fetchAndSwap, oneBlockSort)
{
    int size = 9;
    const int stride = 227; // only every stride-th permutation is sorted, to bound runtime
    int i = 0;
    std::vector<int> orig(size);
    std::iota(orig.begin(), orig.end(), 0);
    do
    {
        if ((i++) % stride != 0)
            continue;

        TNL::Containers::Array<int, TNL::Devices::Cuda> cudaArr(orig);
        auto view = cudaArr.getView();
        fetchAndSwapSorter(view);
        ASSERT_TRUE(is_sorted(view)) << "result " << view << std::endl;
    } while (std::next_permutation(orig.begin(), orig.end()));
}
*/
//----------------------------------------------------------------------------------