#include <vector>

#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/MultiDeviceMemoryOperations.h>
//---------------------------------------------
__host__ __device__ int closestPow2(int x)
{
    if (x == 0)
        return 0;

    int ret = 1;
    while (ret < x)
        ret <<= 1;

    return ret;
}
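//a minimal sanity sketch for closestPow2 (illustrative only; the function
//name closestPow2Example is hypothetical and not part of the sorting code)
#include <cassert>
inline void closestPow2Example()
{
    assert(closestPow2(0) == 0);       //zero is returned unchanged
    assert(closestPow2(1) == 1);       //powers of two map to themselves
    assert(closestPow2(1024) == 1024);
    assert(closestPow2(5) == 8);       //anything else rounds up to the next power of two
}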
template <typename Value, typename Function>
__host__ __device__ void cmpSwap(Value & a, Value & b, bool ascending, const Function & Cmp)
{
    //swaps the pair into the desired order
    if (ascending == Cmp(b, a))
        TNL::swap(a, b);
}
template <typename Value, typename Function>
__global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
                                   Function Cmp, //by value: __global__ parameters cannot be references
                                   int monotonicSeqLen, int len, int partsInSeq)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int part = i / (len / 2); //computes which sorting block this thread belongs to

    //the index of 2 elements that should be compared and swapped
    int s = part * len + (i & ((len / 2) - 1));
    int e = s + len / 2;
    if (e >= arr.getSize()) //arr[e] is virtual padding and will not be exchanged
        return;

    //calculate the direction of swapping
    int monotonicSeqIdx = part / partsInSeq;
    bool ascending = (monotonicSeqIdx & 1) != 0;
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize()) //special case for part with no "partner" to be merged with in next phase
        ascending = true;

    cmpSwap(arr[s], arr[e], ascending, Cmp);
}
//---------------------------------------------
/**
 * kernel for merging if whole block fits into shared memory
 * will merge all the way down til stride == 2
 * */
template <typename Value, typename Function>
__global__
void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr,
                              Function Cmp,
                              int monotonicSeqLen, int len, int partsInSeq)
{
    extern __shared__ int externMem[];
    Value * sharedMem = (Value *)externMem;
    int sharedMemLen = 2*blockDim.x;

    //1st index and last index of subarray that this threadBlock should merge
    int myBlockStart = blockIdx.x * sharedMemLen;
    int myBlockEnd = TNL::min(arr.getSize(), myBlockStart+sharedMemLen);

    //copy from global memory into shared memory
    int copy1 = myBlockStart + threadIdx.x;
    int copy2 = copy1 + blockDim.x;
    if (copy1 < myBlockEnd)
        sharedMem[threadIdx.x] = arr[copy1];
    if (copy2 < myBlockEnd)
        sharedMem[threadIdx.x + blockDim.x] = arr[copy2];
    __syncthreads();

    //calculate the direction of swapping
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int part = i / (len / 2);
    int monotonicSeqIdx = part / partsInSeq;
    bool ascending = (monotonicSeqIdx & 1) != 0;
    //special case for parts with no "partner"
    if ((monotonicSeqIdx + 1) * monotonicSeqLen >= arr.getSize())
        ascending = true;

    //------------------------------------------
    //merge in shared memory, all the way down to stride == 2
    for (; len > 1; len /= 2)
    {
        //calculates which 2 indexes will be compared and swapped
        int part = threadIdx.x / (len / 2);
        int s = part * len + (threadIdx.x & ((len /2) - 1));
        int e = s + len / 2;
        if(e < myBlockEnd - myBlockStart) //not touching virtual padding
            cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp);
        __syncthreads();
    }

    //------------------------------------------
    //writeback to global memory
    if (copy1 < myBlockEnd)
        arr[copy1] = sharedMem[threadIdx.x];
    if (copy2 < myBlockEnd)
        arr[copy2] = sharedMem[threadIdx.x + blockDim.x];
}
//---------------------------------------------
/**
 * sorts whole block of data with bitonic sort, using shared memory
 * */
template <typename Value, typename Function>
__device__
void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
                       TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
                       Value* sharedMem, const Function & Cmp)
{
    //copy from global memory into shared memory
    int copy1 = threadIdx.x;
    int copy2 = copy1 + blockDim.x;
    if(copy1 < src.getSize())
        sharedMem[copy1] = src[copy1];
    if(copy2 < src.getSize())
        sharedMem[copy2] = src[copy2];
    __syncthreads();

    //------------------------------------------
    //bitonic activity
    {
        int i = threadIdx.x;
        int paddedSize = closestPow2(src.getSize());

        for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
        {
            //calculate the direction of swapping
            int monotonicSeqIdx = i / (monotonicSeqLen/2);
            bool ascending = (monotonicSeqIdx & 1) != 0;
            if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner"
                ascending = true;

            for (int len = monotonicSeqLen; len > 1; len /= 2)
            {
                //calculates which 2 indexes will be compared and swapped
                int part = i / (len / 2);
                int s = part * len + (i & ((len / 2) - 1));
                int e = s + len / 2;
                if(e < src.getSize()) //not touching virtual padding
                    cmpSwap(sharedMem[s], sharedMem[e], ascending, Cmp);
                __syncthreads();
            }
        }
    }

    //------------------------------------------
    //writeback to global memory
    if(copy1 < src.getSize())
        dst[copy1] = sharedMem[copy1];
    if(copy2 < src.getSize())
        dst[copy2] = sharedMem[copy2];
}
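//---------------------------------------------
/**
 * sorts whole block of data with bitonic sort
 * this overload does not use shared memory
 * */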
template <typename Value, typename Function>
__device__
void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src,
TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst,
const Function & Cmp)
{
int i = threadIdx.x;
int paddedSize = closestPow2(src.getSize());
for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
{
//calculate the direction of swapping
int monotonicSeqIdx = i / (monotonicSeqLen/2);
bool ascending = (monotonicSeqIdx & 1) != 0;
if ((monotonicSeqIdx + 1) * monotonicSeqLen >= src.getSize()) //special case for parts with no "partner"
ascending = true;
for (int len = monotonicSeqLen; len > 1; len /= 2)
{
//calculates which 2 indexes will be compared and swapped
int part = i / (len / 2);
int s = part * len + (i & ((len / 2) - 1));
int e = s + len / 2;
if(e < src.getSize()) //not touching virtual padding
cmpSwap(src[s], src[e], ascending, Cmp);
__syncthreads();
}
}
}
/**
 * very similar to bitonicMergeSharedMemory
 * does bitonicMergeSharedMemory but afterwards increases monotonicSeqLen
 * then trickles down again
 * this continues until whole sharedMem is sorted
 * */
template <typename Value, typename Function>
__global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, Function Cmp)
{
    extern __shared__ int externMem[];

    int sharedMemLen = 2*blockDim.x;
    int myBlockStart = blockIdx.x * sharedMemLen;
    int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + sharedMemLen);

    //alternate the sort direction per block so that neighboring blocks together
    //form bitonic sequences; the last block always sorts in Cmp order
    if(blockIdx.x%2 || blockIdx.x + 1 == gridDim.x)
        bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value*) externMem, Cmp);
    else
        bitonicSort_Block(arr.getView(myBlockStart, myBlockEnd), arr.getView(myBlockStart, myBlockEnd), (Value*) externMem,
                          [&] __cuda_callable__ (const Value&a, const Value&b){return Cmp(b, a);});
}
//---------------------------------------------
template <typename Value, typename Function>
void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int begin, int end, const Function& Cmp)
{
    TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr = src.getView(begin, end);

    int paddedSize = closestPow2(arr.getSize());
    int threadsNeeded = arr.getSize() / 2 + (arr.getSize() %2 !=0);

    const int threadPerBlock = 512;
    int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0);
    const int sharedMemLen = threadPerBlock * 2;
    const int sharedMemSize = sharedMemLen* sizeof(Value);

    //---------------------------------------------------------------------------------
    //sort each block-sized chunk in shared memory first
    bitoniSort1stStepSharedMemory<<<blocks, threadPerBlock, sharedMemSize>>>(arr, Cmp);

    for (int monotonicSeqLen = 2*sharedMemLen; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
    {
        for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
        {
            if(len > sharedMemLen)
            {
                bitonicMergeGlobal<<<blocks, threadPerBlock>>>(
                    arr, Cmp, monotonicSeqLen, len, partsInSeq);
            }
            else
            {
                //merges all the way down to len == 2, so this phase is done
                bitonicMergeSharedMemory<<<blocks, threadPerBlock, sharedMemSize>>>(
                    arr, Cmp, monotonicSeqLen, len, partsInSeq);
                break;
            }
        }
    }
    cudaDeviceSynchronize();
}
//---------------------------------------------
template <typename Value>
void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, int begin, int end)
{
    bitonicSort(arr, begin, end, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;});
}

template <typename Value, typename Function>
void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function & Cmp)
{
    bitonicSort(arr, 0, arr.getSize(), Cmp);
}

template <typename Value>
void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr)
{
    bitonicSort(arr, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;});
}
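//---------------------------------------------
//a minimal usage sketch of the ArrayView entry points above (illustrative
//only; exampleSortArrayView is a hypothetical name, not part of this interface)
inline void exampleSortArrayView()
{
    TNL::Containers::Array<int, TNL::Devices::Cuda> data{5, 1, 4, 2, 3};
    bitonicSort(data.getView()); //ascending: 1 2 3 4 5
    bitonicSort(data.getView(), [] __cuda_callable__ (const int & a, const int & b) {return b < a;}); //descending: 5 4 3 2 1
}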
//---------------------------------------------
template <typename Value, typename Function>
void bitonicSort(std::vector<Value> & vec, int begin, int end, const Function & Cmp)
{
    //copy to the device, sort there, then copy the result back to the host
    TNL::Containers::Array<Value, TNL::Devices::Cuda> Arr(vec);
    auto view = Arr.getView();
    bitonicSort(view, begin, end, Cmp);

    TNL::Algorithms::MultiDeviceMemoryOperations<TNL::Devices::Host, TNL::Devices::Cuda >::
        copy(vec.data(), view.getData(), view.getSize());
}
template <typename Value>
void bitonicSort(std::vector<Value> & vec, int begin, int end)
{
bitonicSort(vec, begin, end, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;});
}
template <typename Value, typename Function>
void bitonicSort(std::vector<Value> & vec, const Function & Cmp)
{
    bitonicSort(vec, 0, vec.size(), Cmp);
}

template <typename Value>
void bitonicSort(std::vector<Value> & vec)
{
    bitonicSort(vec, [] __cuda_callable__ (const Value & a, const Value & b) {return a < b;});
}
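//a minimal usage sketch of the std::vector overloads above (illustrative
//only; exampleSortVector is a hypothetical name, not part of this interface)
inline void exampleSortVector()
{
    std::vector<int> vec{5, 1, 4, 2, 3};
    bitonicSort(vec);       //whole vector ascending: 1 2 3 4 5
    bitonicSort(vec, 1, 4); //sorts only the range [1, 4), i.e. vec[1]..vec[3]
}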
//---------------------------------------------
template <typename FETCH, typename CMP, typename SWAP>
__global__ void bitonicMergeGlobal(int size, FETCH Fetch,
CMP Cmp, SWAP Swap, //by value: __global__ parameters cannot be references
int monotonicSeqLen, int len, int partsInSeq)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int part = i / (len / 2); //computes which sorting block this thread belongs to
//the index of 2 elements that should be compared and swapped
int s = part * len + (i & ((len / 2) - 1) );
int e = s + len / 2;
if (e >= size) //arr[e] is virtual padding and will not be exchanged with
return;
//calculate the direction of swapping
int monotonicSeqIdx = part / partsInSeq;
bool ascending = (monotonicSeqIdx & 1) != 0;
if ((monotonicSeqIdx + 1) * monotonicSeqLen >= size) //special case for part with no "partner" to be merged with in next phase
ascending = true;
if( ascending == Cmp(Fetch(e), Fetch(s)))
Swap(s, e);
}
template <typename FETCH, typename CMP, typename SWAP>
void bitonicSort(int begin, int end, FETCH Fetch, const CMP& Cmp, SWAP Swap)
{
int size = end - begin;
int paddedSize = closestPow2(size);
int threadsNeeded = size / 2 + (size %2 !=0);
const int maxThreadsPerBlock = 512;
int threadPerBlock = maxThreadsPerBlock;
int blocks = threadsNeeded / threadPerBlock + (threadsNeeded % threadPerBlock != 0);
auto fetchWithOffset =
[=] __cuda_callable__(int i)
{
return Fetch(i + begin);
};
auto swapWithOffset =
    [=] __cuda_callable__(int i, int j) mutable
    {
        Swap(i + begin, j + begin);
    };
for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
{
for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
{
bitonicMergeGlobal<<<blocks, threadPerBlock>>>(
size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq);
}
}
cudaDeviceSynchronize();
}
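//---------------------------------------------
//a minimal usage sketch of the Fetch/Cmp/Swap interface above: sorting keys
//while permuting a second array alongside them (illustrative only;
//exampleSortByKey and its local names are hypothetical)
inline void exampleSortByKey()
{
    TNL::Containers::Array<int, TNL::Devices::Cuda> keys{3, 1, 2};
    TNL::Containers::Array<double, TNL::Devices::Cuda> vals{30.0, 10.0, 20.0};
    auto keysView = keys.getView();
    auto valsView = vals.getView();

    auto fetch = [=] __cuda_callable__ (int i) { return keysView[i]; };
    auto cmp = [] __cuda_callable__ (int a, int b) { return a < b; };
    auto swap = [=] __cuda_callable__ (int i, int j) mutable
    {
        TNL::swap(keysView[i], keysView[j]);
        TNL::swap(valsView[i], valsView[j]);
    };

    bitonicSort(0, keys.getSize(), fetch, cmp, swap);
    //keys: 1 2 3; vals: 10 20 30
}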