Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +145 −68 Original line number Diff line number Diff line Loading @@ -176,85 +176,73 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, #endif template< typename Index, typename Device > CSRAdaptiveKernelView< Index, Device >:: CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } typename Device, typename Fetch, typename Reduction, typename ResultKeeper, int StreamedMemory, bool DispatchScalarCSR = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || std::is_same< Device, Devices::Host >::value > struct CSRAdaptiveKernelSegmentsReductionDispatcher; template< typename Index, typename Device > auto CSRAdaptiveKernelView< Index, Device >:: getView() -> ViewType typename Device, typename Fetch, typename Reduction, typename ResultKeeper, int StreamedMemory > struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, true > { return *this; }; template< typename Index, typename Device > auto CSRAdaptiveKernelView< Index, Device >:: getConstView() const -> ConstViewType { return *this; } template< typename Index, typename Device > TNL::String CSRAdaptiveKernelView< Index, Device >:: getKernelType() template< typename BlocksView, typename Offsets, typename Real, typename... Args > static void reduce( const Offsets& offsets, const BlocksView& blocks, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args) { return "Adaptive"; TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } }; template< typename Index, typename Device > template< typename OffsetsView, typename Device, typename Fetch, typename Reduction, typename ResultKeeper, int StreamedMemory > struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, false > { template< typename BlocksView, typename Offsets, typename Real, typename... Args > void CSRAdaptiveKernelView< Index, Device >:: segmentsReduction( const OffsetsView& offsets, static void reduce( const Offsets& offsets, const BlocksView& blocks, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const Args... args) { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } //constexpr int warpSize = 32; Index blocksCount; const Index threads = details::CSRAdaptiveKernelParameters< Real, StreamedMemory >::CudaBlockSize(); const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block /* Execute kernels on device */ // Fill blocks size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block // Execute kernels on device for( Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if( Cuda::getMaxGridSize() * threads >= neededThreads ) Loading @@ -269,11 +257,12 @@ segmentsReduction( const OffsetsView& offsets, } segmentsReductionCSRAdaptiveKernel< StreamedMemory, BlocksView, OffsetsView, Offsets, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( this->blocks, blocks, gridIdx, offsets, first, Loading @@ -286,11 +275,99 @@ segmentsReduction( const OffsetsView& offsets, } #endif } }; template< typename Index, typename Device > CSRAdaptiveKernelView< Index, Device >& CSRAdaptiveKernelView< Index, Device >:: typename Device, int StreamedMemory > CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); } template< typename Index, typename Device, int StreamedMemory > void CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } template< typename Index, typename Device, int StreamedMemory > auto CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getBlocks() const -> const BlocksView& { return this->blocks; } template< typename Index, typename Device, int StreamedMemory > auto CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getView() -> ViewType { return *this; }; template< typename Index, typename Device, int StreamedMemory > auto CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getConstView() const -> ConstViewType { return *this; } template< typename Index, typename Device, int StreamedMemory > TNL::String CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getKernelType() { return "Adaptive"; } template< typename Index, typename Device, int StreamedMemory > template< typename Offsets, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: segmentsReduction( const Offsets& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory >::template reduce< BlocksView, Offsets, Real, Args... >( offsets, this->getBlocks(), first, last, fetch, reduction, keeper, zero, args... ); } template< typename Index, typename Device, int StreamedMemory > CSRAdaptiveKernelView< Index, Device, StreamedMemory >& CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp +145 −68 Original line number Diff line number Diff line Loading @@ -176,85 +176,73 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks, #endif template< typename Index, typename Device > CSRAdaptiveKernelView< Index, Device >:: CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernelView< Index, Device >:: setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } typename Device, typename Fetch, typename Reduction, typename ResultKeeper, int StreamedMemory, bool DispatchScalarCSR = details::CheckFetchLambda< Index, Fetch >::hasAllParameters() || std::is_same< Device, Devices::Host >::value > struct CSRAdaptiveKernelSegmentsReductionDispatcher; template< typename Index, typename Device > auto CSRAdaptiveKernelView< Index, Device >:: getView() -> ViewType typename Device, typename Fetch, typename Reduction, typename ResultKeeper, int StreamedMemory > struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, true > { return *this; }; template< typename Index, typename Device > auto CSRAdaptiveKernelView< Index, Device >:: getConstView() const -> ConstViewType { return *this; } template< typename Index, typename Device > TNL::String CSRAdaptiveKernelView< Index, Device >:: getKernelType() template< typename BlocksView, typename Offsets, typename Real, typename... Args > static void reduce( const Offsets& offsets, const BlocksView& blocks, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args) { return "Adaptive"; TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } }; template< typename Index, typename Device > template< typename OffsetsView, typename Device, typename Fetch, typename Reduction, typename ResultKeeper, int StreamedMemory > struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, false > { template< typename BlocksView, typename Offsets, typename Real, typename... Args > void CSRAdaptiveKernelView< Index, Device >:: segmentsReduction( const OffsetsView& offsets, static void reduce( const Offsets& offsets, const BlocksView& blocks, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const Args... args) { #ifdef HAVE_CUDA if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() ) { TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >:: segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); return; } //constexpr int warpSize = 32; Index blocksCount; const Index threads = details::CSRAdaptiveKernelParameters< Real, StreamedMemory >::CudaBlockSize(); const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); /* Fill blocks */ size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block /* Execute kernels on device */ // Fill blocks size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block // Execute kernels on device for( Index gridIdx = 0; neededThreads != 0; gridIdx++ ) { if( Cuda::getMaxGridSize() * threads >= neededThreads ) Loading @@ -269,11 +257,12 @@ segmentsReduction( const OffsetsView& offsets, } segmentsReductionCSRAdaptiveKernel< StreamedMemory, BlocksView, OffsetsView, Offsets, Index, Fetch, Reduction, ResultKeeper, Real, Args... > <<<blocksCount, threads>>>( this->blocks, blocks, gridIdx, offsets, first, Loading @@ -286,11 +275,99 @@ segmentsReduction( const OffsetsView& offsets, } #endif } }; template< typename Index, typename Device > CSRAdaptiveKernelView< Index, Device >& CSRAdaptiveKernelView< Index, Device >:: typename Device, int StreamedMemory > CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: CSRAdaptiveKernelView( BlocksType& blocks ) { this->blocks.bind( blocks ); } template< typename Index, typename Device, int StreamedMemory > void CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: setBlocks( BlocksType& blocks ) { this->blocks.bind( blocks ); } template< typename Index, typename Device, int StreamedMemory > auto CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getBlocks() const -> const BlocksView& { return this->blocks; } template< typename Index, typename Device, int StreamedMemory > auto CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getView() -> ViewType { return *this; }; template< typename Index, typename Device, int StreamedMemory > auto CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getConstView() const -> ConstViewType { return *this; } template< typename Index, typename Device, int StreamedMemory > TNL::String CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: getKernelType() { return "Adaptive"; } template< typename Index, typename Device, int StreamedMemory > template< typename Offsets, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: segmentsReduction( const Offsets& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory >::template reduce< BlocksView, Offsets, Real, Args... >( offsets, this->getBlocks(), first, last, fetch, reduction, keeper, zero, args... ); } template< typename Index, typename Device, int StreamedMemory > CSRAdaptiveKernelView< Index, Device, StreamedMemory >& CSRAdaptiveKernelView< Index, Device, StreamedMemory >:: operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView ) { this->blocks.bind( kernelView.blocks ); Loading