Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +10 −84 Original line number Diff line number Diff line Loading @@ -63,10 +63,7 @@ struct CSRAdaptiveKernel using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static TNL::String getKernelType() { return ViewType::getKernelType(); }; static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; Loading @@ -93,84 +90,16 @@ struct CSRAdaptiveKernel const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } Index &sum ); template< typename Offsets > void init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); void init( const Offsets& offsets ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); void reset(); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); ViewType getView(); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } void reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } ViewType getView() { return this->view; }; ConstViewType getConstView() const { return this->view; }; ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, Loading @@ -185,10 +114,7 @@ struct CSRAdaptiveKernel const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } Args... args ) const; protected: BlocksType blocks; Loading src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +142 −1 Original line number Diff line number Diff line Loading @@ -22,6 +22,147 @@ namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > TNL::String CSRAdaptiveKernel< Index, Device >:: getKernelType() { return ViewType::getKernelType(); }; template< typename Index, typename Device > template< typename Offsets > Index CSRAdaptiveKernel< Index, Device >:: findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } template< typename Index, typename Device > template< typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } } // namespace Segments } // namespace Algorithms Loading Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h +10 −84 Original line number Diff line number Diff line Loading @@ -63,10 +63,7 @@ struct CSRAdaptiveKernel using BlocksType = typename ViewType::BlocksType; using BlocksView = typename BlocksType::ViewType; static TNL::String getKernelType() { return ViewType::getKernelType(); }; static TNL::String getKernelType(); static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256; Loading @@ -93,84 +90,16 @@ struct CSRAdaptiveKernel const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } Index &sum ); template< typename Offsets > void init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); void init( const Offsets& offsets ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); void reset(); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); ViewType getView(); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } void reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } ViewType getView() { return this->view; }; ConstViewType getConstView() const { return this->view; }; ConstViewType getConstView() const; template< typename OffsetsView, typename Fetch, Loading @@ -185,10 +114,7 @@ struct CSRAdaptiveKernel const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } Args... args ) const; protected: BlocksType blocks; Loading
src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp +142 −1 Original line number Diff line number Diff line Loading @@ -22,6 +22,147 @@ namespace TNL { namespace Algorithms { namespace Segments { template< typename Index, typename Device > TNL::String CSRAdaptiveKernel< Index, Device >:: getKernelType() { return ViewType::getKernelType(); }; template< typename Index, typename Device > template< typename Offsets > Index CSRAdaptiveKernel< Index, Device >:: findLimit( const Index start, const Offsets& offsets, const Index size, details::Type &type, Index &sum ) { sum = 0; for (Index current = start; current < size - 1; current++ ) { Index elements = offsets[ current + 1 ] - offsets[ current ]; sum += elements; if( sum > SHARED_PER_WARP ) { if( current - start > 0 ) // extra row { type = details::Type::STREAM; return current; } else { // one long row if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT ) type = details::Type::VECTOR; else type = details::Type::LONG; return current + 1; } } } type = details::Type::STREAM; return size - 1; // return last row pointer } template< typename Index, typename Device > template< typename Offsets > void CSRAdaptiveKernel< Index, Device >:: init( const Offsets& offsets ) { using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >; HostOffsetsType hostOffsets( offsets ); const Index rows = offsets.getSize(); Index sum, start( 0 ), nextStart( 0 ); // Fill blocks std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks; inBlocks.reserve( rows ); while( nextStart != rows - 1 ) { details::Type type; nextStart = findLimit( start, hostOffsets, rows, type, sum ); if( type == details::Type::LONG ) { const Index blocksCount = inBlocks.size(); const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize(); Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount; if( warpsLeft == 0 ) warpsLeft = warpsPerCudaBlock; for( Index index = 0; index < warpsLeft; index++ ) inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft ); } else { inBlocks.emplace_back(start, type, nextStart, offsets.getElement(nextStart), offsets.getElement(start) ); } start = nextStart; } inBlocks.emplace_back(nextStart); this->blocks = inBlocks; this->view.setBlocks( blocks ); } template< typename Index, typename Device > void CSRAdaptiveKernel< Index, Device >:: reset() { this->blocks.reset(); this->view.setBlocks( blocks ); } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getView() -> ViewType { return this->view; } template< typename Index, typename Device > auto CSRAdaptiveKernel< Index, Device >:: getConstView() const -> ConstViewType { return this->view; }; template< typename Index, typename Device > template< typename OffsetsView, typename Fetch, typename Reduction, typename ResultKeeper, typename Real, typename... Args > void CSRAdaptiveKernel< Index, Device >:: segmentsReduction( const OffsetsView& offsets, Index first, Index last, Fetch& fetch, const Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... ); } } // namespace Segments } // namespace Algorithms Loading