Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#include <TNL/Containers/Array.h>
#include <TNL/Algorithms/ParallelFor.h>
using namespace TNL;
using namespace TNL::Containers;
typedef Devices::Cuda Device;
//---------------------------------------------
__host__ __device__ int closestPow2(int x)
{
if(x ==0)
return 0;
int ret = 1;
while (ret < x)
ret <<= 1;
return ret;
}
//---------------------------------------------
void bitonicSort(ArrayView<int, Device> arr, int begin, int end, bool sortAscending)
{
int arrSize = end - begin;
int paddedSize = closestPow2(arrSize);
auto CmpSwap = [=] __cuda_callable__ (int i, int monotonicSeqLen, int len, int partsInSeq) mutable {
int part = i / (len / 2);
int s = begin + part * len + (i % (len / 2));
int e = s + len / 2;
if (e >= end)
return;
//calculate the direction of swapping
int monotonicSeqIdx = part / partsInSeq;
bool ascending = (monotonicSeqIdx % 2) == 0 ? !sortAscending : sortAscending;
//special case for parts with no "partner"
if ((monotonicSeqIdx + 1) * monotonicSeqLen >= end)
ascending = sortAscending;
//templated size of block
auto &a = arr[s];
auto &b = arr[e];
if ((ascending && a > b) || (!ascending && a < b))
TNL::swap(a, b);
};
for (int monotonicSeqLen = 2; monotonicSeqLen <= paddedSize; monotonicSeqLen *= 2)
{
for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2)
{
Algorithms::ParallelFor< Device>::exec(0, arrSize/2, CmpSwap, monotonicSeqLen, len, partsInSeq);
}
}
}
void bitonicSort(ArrayView<int, Device> arr, bool sortAscending = true)
{
bitonicSort(arr, 0, arr.getSize(), sortAscending);
}