Loading src/bitonicSort/bitonicSort.h +28 −28 Original line number Original line Diff line number Diff line Loading @@ -28,8 +28,8 @@ __host__ __device__ int closestPow2(int x) return ret; return ret; } } template <typename Value, typename Function> template <typename Value, typename CMP> __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp) __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) { { if (ascending == Cmp(b, a)) if (ascending == Cmp(b, a)) TNL::swap(a, b); TNL::swap(a, b); Loading @@ -41,9 +41,9 @@ __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Funct * this kernel simulates 1 exchange * this kernel simulates 1 exchange * splits input arr that is bitonic into 2 bitonic sequences * splits input arr that is bitonic into 2 bitonic sequences */ */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x; Loading Loading @@ -73,9 +73,9 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Device * * * this version uses shared memory to do the operations * this version uses shared memory to do the operations * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { extern __shared__ int externMem[]; extern __shared__ int externMem[]; Loading Loading @@ -133,9 +133,9 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL:: * * * this user only operates on global memory, no shared memory is used * this user only operates on global memory, no shared memory is used * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { //1st index and last index of subarray that this threadBlock should merge //1st index and last index of subarray that this threadBlock should merge Loading Loading @@ -178,10 +178,10 @@ __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cud * works independently from other concurrent blocks * works independently from other concurrent blocks * @param sharedMem sharedMem pointer has to be able to store all of src elements * @param sharedMem sharedMem pointer has to be able to store all of src elements * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, Value *sharedMem, const Function &Cmp) Value *sharedMem, const CMP &Cmp) { { //copy from globalMem into sharedMem //copy from globalMem into sharedMem for(int i = threadIdx.x; i < src.getSize(); i += blockDim.x) for(int i = threadIdx.x; i < src.getSize(); i += blockDim.x) Loading Loading @@ -236,9 +236,9 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices * works independently from other concurrent blocks * works independently from other concurrent blocks * this version doesnt use shared memory and is prefered for Value with big size * this version doesnt use shared memory and is prefered for Value with big size * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, const Function &Cmp) const CMP &Cmp) { { int paddedSize = closestPow2_ptx(src.getSize()); int paddedSize = closestPow2_ptx(src.getSize()); Loading Loading @@ -274,8 +274,8 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices * sorts @param arr in alternating order to create bitonic sequences * sorts @param arr in alternating order to create bitonic sequences * sharedMem has to be able to store at least blockDim.x*2 elements * sharedMem has to be able to store at least blockDim.x*2 elements * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, CMP Cmp) { { extern __shared__ int externMem[]; extern __shared__ int externMem[]; int sharedMemLen = 2 * blockDim.x; int sharedMemLen = 2 * blockDim.x; Loading @@ -294,8 +294,8 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, * sorts @param arr in alternating order to create bitonic sequences * sorts @param arr in alternating order to create bitonic sequences * doesn't use shared memory * doesn't use shared memory * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, CMP Cmp) { { int myBlockStart = blockIdx.x * (2 * blockDim.x); int myBlockStart = blockIdx.x * (2 * blockDim.x); int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); Loading @@ -308,8 +308,8 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices } } //--------------------------------------------- //--------------------------------------------- template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int begin, int end, const Function &Cmp) void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int begin, int end, const CMP &Cmp) { { TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr = src.getView(begin, end); TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr = src.getView(begin, end); int paddedSize = closestPow2(arr.getSize()); int paddedSize = closestPow2(arr.getSize()); Loading Loading @@ -365,14 +365,14 @@ void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int //--------------------------------------------- //--------------------------------------------- template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, int begin, int end) void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, int begin, int end) { { bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } } template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const CMP &Cmp) { { bitonicSort(arr, 0, arr.getSize(), Cmp); bitonicSort(arr, 0, arr.getSize(), Cmp); } } Loading @@ -384,8 +384,8 @@ void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr) } } //--------------------------------------------- //--------------------------------------------- template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(std::vector<Value> &vec, int begin, int end, const Function &Cmp) void bitonicSort(std::vector<Value> &vec, int begin, int end, const CMP &Cmp) { { TNL::Containers::Array<Value, TNL::Devices::Cuda> Arr(vec); TNL::Containers::Array<Value, TNL::Devices::Cuda> Arr(vec); auto view = Arr.getView(); auto view = Arr.getView(); Loading @@ -401,8 +401,8 @@ void bitonicSort(std::vector<Value> &vec, int begin, int end) bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } } template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(std::vector<Value> &vec, const Function &Cmp) void bitonicSort(std::vector<Value> &vec, const CMP &Cmp) { { bitonicSort(vec, 0, vec.size(), Cmp); bitonicSort(vec, 0, vec.size(), Cmp); } } Loading @@ -417,8 +417,7 @@ void bitonicSort(std::vector<Value> &vec) //--------------------------------------------- //--------------------------------------------- template <typename FETCH, typename CMP, typename SWAP> template <typename FETCH, typename CMP, typename SWAP> __global__ void bitonicMergeGlobal(int size, FETCH Fetch, __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, const CMP &Cmp, SWAP Swap, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x; Loading Loading @@ -468,7 +467,8 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { { bitonicMergeGlobal<<<blocks, threadPerBlock>>>( bitonicMergeGlobal<<<blocks, threadPerBlock>>>( size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq); size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq); } } } } cudaDeviceSynchronize(); cudaDeviceSynchronize(); Loading Loading
src/bitonicSort/bitonicSort.h +28 −28 Original line number Original line Diff line number Diff line Loading @@ -28,8 +28,8 @@ __host__ __device__ int closestPow2(int x) return ret; return ret; } } template <typename Value, typename Function> template <typename Value, typename CMP> __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Function &Cmp) __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const CMP &Cmp) { { if (ascending == Cmp(b, a)) if (ascending == Cmp(b, a)) TNL::swap(a, b); TNL::swap(a, b); Loading @@ -41,9 +41,9 @@ __host__ __device__ void cmpSwap(Value &a, Value &b, bool ascending, const Funct * this kernel simulates 1 exchange * this kernel simulates 1 exchange * splits input arr that is bitonic into 2 bitonic sequences * splits input arr that is bitonic into 2 bitonic sequences */ */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x; Loading Loading @@ -73,9 +73,9 @@ __global__ void bitonicMergeGlobal(TNL::Containers::ArrayView<Value, TNL::Device * * * this version uses shared memory to do the operations * this version uses shared memory to do the operations * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { extern __shared__ int externMem[]; extern __shared__ int externMem[]; Loading Loading @@ -133,9 +133,9 @@ __global__ void bitonicMergeSharedMemory(TNL::Containers::ArrayView<Value, TNL:: * * * this user only operates on global memory, no shared memory is used * this user only operates on global memory, no shared memory is used * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp, CMP Cmp, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { //1st index and last index of subarray that this threadBlock should merge //1st index and last index of subarray that this threadBlock should merge Loading Loading @@ -178,10 +178,10 @@ __global__ void bitonicMerge(TNL::Containers::ArrayView<Value, TNL::Devices::Cud * works independently from other concurrent blocks * works independently from other concurrent blocks * @param sharedMem sharedMem pointer has to be able to store all of src elements * @param sharedMem sharedMem pointer has to be able to store all of src elements * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> dst, Value *sharedMem, const Function &Cmp) Value *sharedMem, const CMP &Cmp) { { //copy from globalMem into sharedMem //copy from globalMem into sharedMem for(int i = threadIdx.x; i < src.getSize(); i += blockDim.x) for(int i = threadIdx.x; i < src.getSize(); i += blockDim.x) Loading Loading @@ -236,9 +236,9 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices * works independently from other concurrent blocks * works independently from other concurrent blocks * this version doesnt use shared memory and is prefered for Value with big size * this version doesnt use shared memory and is prefered for Value with big size * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, const Function &Cmp) const CMP &Cmp) { { int paddedSize = closestPow2_ptx(src.getSize()); int paddedSize = closestPow2_ptx(src.getSize()); Loading Loading @@ -274,8 +274,8 @@ __device__ void bitonicSort_Block(TNL::Containers::ArrayView<Value, TNL::Devices * sorts @param arr in alternating order to create bitonic sequences * sorts @param arr in alternating order to create bitonic sequences * sharedMem has to be able to store at least blockDim.x*2 elements * sharedMem has to be able to store at least blockDim.x*2 elements * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, CMP Cmp) { { extern __shared__ int externMem[]; extern __shared__ int externMem[]; int sharedMemLen = 2 * blockDim.x; int sharedMemLen = 2 * blockDim.x; Loading @@ -294,8 +294,8 @@ __global__ void bitoniSort1stStepSharedMemory(TNL::Containers::ArrayView<Value, * sorts @param arr in alternating order to create bitonic sequences * sorts @param arr in alternating order to create bitonic sequences * doesn't use shared memory * doesn't use shared memory * */ * */ template <typename Value, typename Function> template <typename Value, typename CMP> __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, CMP Cmp) { { int myBlockStart = blockIdx.x * (2 * blockDim.x); int myBlockStart = blockIdx.x * (2 * blockDim.x); int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); int myBlockEnd = TNL::min(arr.getSize(), myBlockStart + (2 * blockDim.x)); Loading @@ -308,8 +308,8 @@ __global__ void bitoniSort1stStep(TNL::Containers::ArrayView<Value, TNL::Devices } } //--------------------------------------------- //--------------------------------------------- template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int begin, int end, const Function &Cmp) void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int begin, int end, const CMP &Cmp) { { TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr = src.getView(begin, end); TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr = src.getView(begin, end); int paddedSize = closestPow2(arr.getSize()); int paddedSize = closestPow2(arr.getSize()); Loading Loading @@ -365,14 +365,14 @@ void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> src, int //--------------------------------------------- //--------------------------------------------- template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, int begin, int end) void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, int begin, int end) { { bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); bitonicSort(arr, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } } template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const Function &Cmp) void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr, const CMP &Cmp) { { bitonicSort(arr, 0, arr.getSize(), Cmp); bitonicSort(arr, 0, arr.getSize(), Cmp); } } Loading @@ -384,8 +384,8 @@ void bitonicSort(TNL::Containers::ArrayView<Value, TNL::Devices::Cuda> arr) } } //--------------------------------------------- //--------------------------------------------- template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(std::vector<Value> &vec, int begin, int end, const Function &Cmp) void bitonicSort(std::vector<Value> &vec, int begin, int end, const CMP &Cmp) { { TNL::Containers::Array<Value, TNL::Devices::Cuda> Arr(vec); TNL::Containers::Array<Value, TNL::Devices::Cuda> Arr(vec); auto view = Arr.getView(); auto view = Arr.getView(); Loading @@ -401,8 +401,8 @@ void bitonicSort(std::vector<Value> &vec, int begin, int end) bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); bitonicSort(vec, begin, end, [] __cuda_callable__(const Value &a, const Value &b) { return a < b; }); } } template <typename Value, typename Function> template <typename Value, typename CMP> void bitonicSort(std::vector<Value> &vec, const Function &Cmp) void bitonicSort(std::vector<Value> &vec, const CMP &Cmp) { { bitonicSort(vec, 0, vec.size(), Cmp); bitonicSort(vec, 0, vec.size(), Cmp); } } Loading @@ -417,8 +417,7 @@ void bitonicSort(std::vector<Value> &vec) //--------------------------------------------- //--------------------------------------------- template <typename FETCH, typename CMP, typename SWAP> template <typename FETCH, typename CMP, typename SWAP> __global__ void bitonicMergeGlobal(int size, FETCH Fetch, __global__ void bitonicMergeGlobal(int size, FETCH Fetch, CMP Cmp, SWAP Swap, const CMP &Cmp, SWAP Swap, int monotonicSeqLen, int len, int partsInSeq) int monotonicSeqLen, int len, int partsInSeq) { { int i = blockIdx.x * blockDim.x + threadIdx.x; int i = blockIdx.x * blockDim.x + threadIdx.x; Loading Loading @@ -468,7 +467,8 @@ void bitonicSort(int begin, int end, FETCH Fetch, const CMP &Cmp, SWAP Swap) for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) for (int len = monotonicSeqLen, partsInSeq = 1; len > 1; len /= 2, partsInSeq *= 2) { { bitonicMergeGlobal<<<blocks, threadPerBlock>>>( bitonicMergeGlobal<<<blocks, threadPerBlock>>>( size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq); size, fetchWithOffset, Cmp, swapWithOffset, monotonicSeqLen, len, partsInSeq); } } } } cudaDeviceSynchronize(); cudaDeviceSynchronize(); Loading