Loading src/TNL/Algorithms/Sorting/detail/Quicksorter.h +1 −2 Original line number Diff line number Diff line Loading @@ -13,7 +13,6 @@ #pragma once #include <TNL/Containers/Array.h> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Sorting/detail/task.h> namespace TNL { Loading Loading @@ -94,7 +93,7 @@ class Quicksorter< Value, Devices::Cuda > Containers::Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each Containers::Array<int, Devices::Cuda> cuda_blockToTaskMapping; Containers::Vector<int, Devices::Cuda> cuda_reductionTaskInitMem; Containers::Array<int, Devices::Cuda> cuda_reductionTaskInitMem; int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0; int iteration = 0; Loading src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp +4 −15 Original line number Diff line number Diff line Loading @@ -17,7 +17,8 @@ #include <TNL/Algorithms/Sorting/detail/quicksort_kernel.h> #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h> #include <TNL/Algorithms/Sorting/detail/Quicksorter.h> #include <TNL/Algorithms/Scan.h> #include <TNL/Algorithms/reduce.h> #include <TNL/Algorithms/scan.h> namespace TNL { namespace Algorithms { Loading Loading @@ -314,8 +315,7 @@ int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::C int size = task.partitionEnd - task.partitionBegin; return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; return Algorithms::reduce<Devices::Cuda>( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 ); return reduce< Devices::Cuda >( 0, quicksort.host_1stPhaseTasksAmount, fetch, TNL::Plus{} ); } template< typename Value > Loading @@ -323,14 +323,6 @@ int Quicksorter< Value, Devices::Cuda >:: getSetsNeeded(int elemPerBlock) const { /*auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView(); auto fetch = [=] __cuda_callable__(int i) { const auto &task = view[i]; int size = task.partitionEnd - task.partitionBegin; return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; return Algorithms::reduce<Devices::Cuda>(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/ return getSetsNeededFunction< Value >( elemPerBlock, *this ); } Loading Loading @@ -372,10 +364,7 @@ initTasks(int elemPerBlock, const CMP &Cmp) cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); //cuda_reductionTaskInitMem[i] == how many blocks task i needs //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; }; Algorithms::Scan<Devices::Cuda, Algorithms::ScanType::Inclusive >:: perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0); inplaceInclusiveScan(cuda_reductionTaskInitMem); //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1); Loading src/TNL/Algorithms/Sorting/detail/cudaPartition.h +6 −3 Original line number Diff line number Diff line Loading @@ -13,8 +13,8 @@ #pragma once #include <TNL/Containers/Array.h> #include <TNL/Algorithms/Sorting/detail/reduction.h> #include <TNL/Algorithms/Sorting/detail/task.h> #include <TNL/Algorithms/detail/CudaScanKernel.h> namespace TNL { namespace Algorithms { Loading Loading @@ -185,8 +185,11 @@ __device__ void cudaPartition( Containers::ArrayView<Value, Devices::Cuda> src, int smaller = 0, bigger = 0; countElem(srcView, Cmp, smaller, bigger, pivot); int smallerPrefSumInc = blockInclusivePrefixSum(smaller); int biggerPrefSumInc = blockInclusivePrefixSum(bigger); //synchronization is in this function already using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >; __shared__ typename BlockScan::Storage storage; int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage ); int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage ); if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values { Loading src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h +5 −4 Original line number Diff line number Diff line Loading @@ -15,8 +15,7 @@ #include <TNL/Containers/Array.h> #include "cassert" #include <TNL/Algorithms/Sorting/detail/bitonicSort.h> #include <TNL/Algorithms/Sorting/detail/reduction.h> #include <TNL/Algorithms/Sorting/detail/cudaPartition.h> #include <TNL/Algorithms/detail/CudaScanKernel.h> namespace TNL { namespace Algorithms { Loading Loading @@ -134,8 +133,10 @@ __device__ void singleBlockQuickSort( Containers::ArrayView<Value, TNL::Devices: countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot); //synchronization is in this function already int smallerPrefSumInc = blockInclusivePrefixSum(smaller); int biggerPrefSumInc = blockInclusivePrefixSum(bigger); using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >; __shared__ typename BlockScan::Storage storage; int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage ); int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage ); if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src { Loading src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h +2 −4 Original line number Diff line number Diff line Loading @@ -13,8 +13,6 @@ #pragma once #include <TNL/Containers/Array.h> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Sorting/detail/reduction.h> #include <TNL/Algorithms/Sorting/detail/task.h> #include <TNL/Algorithms/Sorting/detail/cudaPartition.h> #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h> Loading @@ -33,7 +31,7 @@ __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2n //----------------------------------------------------------- __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock, Containers::VectorView<int, Devices::Cuda> blocksNeeded) Containers::ArrayView<int, Devices::Cuda> blocksNeeded) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= cuda_tasks.getSize()) Loading @@ -49,7 +47,7 @@ __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> template <typename Value, typename CMP> __global__ void cudaInitTask(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, Containers::ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping, Containers::VectorView<int, Devices::Cuda> cuda_reductionTaskInitMem, Containers::ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem, Containers::ArrayView<Value, Devices::Cuda> src, CMP Cmp) { if (blockIdx.x >= cuda_tasks.getSize()) Loading Loading
src/TNL/Algorithms/Sorting/detail/Quicksorter.h +1 −2 Original line number Diff line number Diff line Loading @@ -13,7 +13,6 @@ #pragma once #include <TNL/Containers/Array.h> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Sorting/detail/task.h> namespace TNL { Loading Loading @@ -94,7 +93,7 @@ class Quicksorter< Value, Devices::Cuda > Containers::Array<int, Devices::Cuda> cuda_newTasksAmount, cuda_2ndPhaseTasksAmount; //is in reality 1 integer each Containers::Array<int, Devices::Cuda> cuda_blockToTaskMapping; Containers::Vector<int, Devices::Cuda> cuda_reductionTaskInitMem; Containers::Array<int, Devices::Cuda> cuda_reductionTaskInitMem; int host_1stPhaseTasksAmount = 0, host_2ndPhaseTasksAmount = 0; int iteration = 0; Loading
src/TNL/Algorithms/Sorting/detail/Quicksorter.hpp +4 −15 Original line number Diff line number Diff line Loading @@ -17,7 +17,8 @@ #include <TNL/Algorithms/Sorting/detail/quicksort_kernel.h> #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h> #include <TNL/Algorithms/Sorting/detail/Quicksorter.h> #include <TNL/Algorithms/Scan.h> #include <TNL/Algorithms/reduce.h> #include <TNL/Algorithms/scan.h> namespace TNL { namespace Algorithms { Loading Loading @@ -314,8 +315,7 @@ int getSetsNeededFunction(int elemPerBlock, const Quicksorter< Value, Devices::C int size = task.partitionEnd - task.partitionBegin; return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; return Algorithms::reduce<Devices::Cuda>( 0, quicksort.host_1stPhaseTasksAmount, fetch, reduction, 0 ); return reduce< Devices::Cuda >( 0, quicksort.host_1stPhaseTasksAmount, fetch, TNL::Plus{} ); } template< typename Value > Loading @@ -323,14 +323,6 @@ int Quicksorter< Value, Devices::Cuda >:: getSetsNeeded(int elemPerBlock) const { /*auto view = iteration % 2 == 0 ? cuda_tasks.getConstView() : cuda_newTasks.getConstView(); auto fetch = [=] __cuda_callable__(int i) { const auto &task = view[i]; int size = task.partitionEnd - task.partitionBegin; return size / elemPerBlock + (size % elemPerBlock != 0); }; auto reduction = [] __cuda_callable__(int a, int b) { return a + b; }; return Algorithms::reduce<Devices::Cuda>(0, host_1stPhaseTasksAmount, fetch, reduction, 0);*/ return getSetsNeededFunction< Value >( elemPerBlock, *this ); } Loading Loading @@ -372,10 +364,7 @@ initTasks(int elemPerBlock, const CMP &Cmp) cuda_reductionTaskInitMem.getView(0, host_1stPhaseTasksAmount)); //cuda_reductionTaskInitMem[i] == how many blocks task i needs //auto reduce = [] __cuda_callable__(const int &a, const int &b) { return a + b; }; Algorithms::Scan<Devices::Cuda, Algorithms::ScanType::Inclusive >:: perform(cuda_reductionTaskInitMem, 0, cuda_reductionTaskInitMem.getSize(), TNL::Plus{}, 0); inplaceInclusiveScan(cuda_reductionTaskInitMem); //cuda_reductionTaskInitMem[i] == how many blocks task [0..i] need int blocksNeeded = cuda_reductionTaskInitMem.getElement(host_1stPhaseTasksAmount - 1); Loading
src/TNL/Algorithms/Sorting/detail/cudaPartition.h +6 −3 Original line number Diff line number Diff line Loading @@ -13,8 +13,8 @@ #pragma once #include <TNL/Containers/Array.h> #include <TNL/Algorithms/Sorting/detail/reduction.h> #include <TNL/Algorithms/Sorting/detail/task.h> #include <TNL/Algorithms/detail/CudaScanKernel.h> namespace TNL { namespace Algorithms { Loading Loading @@ -185,8 +185,11 @@ __device__ void cudaPartition( Containers::ArrayView<Value, Devices::Cuda> src, int smaller = 0, bigger = 0; countElem(srcView, Cmp, smaller, bigger, pivot); int smallerPrefSumInc = blockInclusivePrefixSum(smaller); int biggerPrefSumInc = blockInclusivePrefixSum(bigger); //synchronization is in this function already using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >; __shared__ typename BlockScan::Storage storage; int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage ); int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage ); if (threadIdx.x == blockDim.x - 1) //last thread in block has sum of all values { Loading
src/TNL/Algorithms/Sorting/detail/quicksort_1Block.h +5 −4 Original line number Diff line number Diff line Loading @@ -15,8 +15,7 @@ #include <TNL/Containers/Array.h> #include "cassert" #include <TNL/Algorithms/Sorting/detail/bitonicSort.h> #include <TNL/Algorithms/Sorting/detail/reduction.h> #include <TNL/Algorithms/Sorting/detail/cudaPartition.h> #include <TNL/Algorithms/detail/CudaScanKernel.h> namespace TNL { namespace Algorithms { Loading Loading @@ -134,8 +133,10 @@ __device__ void singleBlockQuickSort( Containers::ArrayView<Value, TNL::Devices: countElem(src.getView(begin, end), Cmp, smaller, bigger, pivot); //synchronization is in this function already int smallerPrefSumInc = blockInclusivePrefixSum(smaller); int biggerPrefSumInc = blockInclusivePrefixSum(bigger); using BlockScan = Algorithms::detail::CudaBlockScan< Algorithms::detail::ScanType::Inclusive, 0, TNL::Plus, int >; __shared__ typename BlockScan::Storage storage; int smallerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, smaller, threadIdx.x, storage ); int biggerPrefSumInc = BlockScan::scan( TNL::Plus{}, 0, bigger, threadIdx.x, storage ); if (threadIdx.x == blockDim.x - 1) //has sum of all smaller and greater elements than pivot in src { Loading
src/TNL/Algorithms/Sorting/detail/quicksort_kernel.h +2 −4 Original line number Diff line number Diff line Loading @@ -13,8 +13,6 @@ #pragma once #include <TNL/Containers/Array.h> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/Sorting/detail/reduction.h> #include <TNL/Algorithms/Sorting/detail/task.h> #include <TNL/Algorithms/Sorting/detail/cudaPartition.h> #include <TNL/Algorithms/Sorting/detail/quicksort_1Block.h> Loading @@ -33,7 +31,7 @@ __device__ void writeNewTask(int begin, int end, int iteration, int maxElemFor2n //----------------------------------------------------------- __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, int elemPerBlock, Containers::VectorView<int, Devices::Cuda> blocksNeeded) Containers::ArrayView<int, Devices::Cuda> blocksNeeded) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i >= cuda_tasks.getSize()) Loading @@ -49,7 +47,7 @@ __global__ void cudaCalcBlocksNeeded(Containers::ArrayView<TASK, Devices::Cuda> template <typename Value, typename CMP> __global__ void cudaInitTask(Containers::ArrayView<TASK, Devices::Cuda> cuda_tasks, Containers::ArrayView<int, Devices::Cuda> cuda_blockToTaskMapping, Containers::VectorView<int, Devices::Cuda> cuda_reductionTaskInitMem, Containers::ArrayView<int, Devices::Cuda> cuda_reductionTaskInitMem, Containers::ArrayView<Value, Devices::Cuda> src, CMP Cmp) { if (blockIdx.x >= cuda_tasks.getSize()) Loading