Loading src/TNL/Algorithms/Segments/BiEllpack.h +109 −98 Original line number Diff line number Diff line Loading @@ -15,9 +15,12 @@ #include <TNL/Algorithms/Segments/BiEllpackView.h> #include <TNL/Algorithms/Segments/SegmentView.h> namespace TNL { namespace Algorithms { namespace Segments { namespace TNL { namespace Algorithms { namespace Segments { template <typename Device, typename Index, Loading @@ -27,16 +30,15 @@ template< typename Device, class BiEllpack { public: using DeviceType = Device; using IndexType = std::remove_const_t<Index>; using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >; using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>; static constexpr ElementsOrganization getOrganization() { return Organization; } using ViewType = BiEllpackView< Device, Index, Organization >; using ViewType = BiEllpackView< Device, Index, Organization, WarpSize >; template <typename Device_, typename Index_> using ViewTemplate = BiEllpackView< Device_, Index_, Organization >; using ConstViewType = BiEllpackView< Device, std::add_const_t< IndexType >, Organization >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; using ViewTemplate = BiEllpackView<Device_, Index_, Organization, WarpSize >; using ConstViewType = typename ViewType::ConstViewType; using SegmentViewType = typename ViewType::SegmentViewType; static constexpr bool havePadding() { return true; }; Loading @@ -60,7 +62,8 @@ class BiEllpack * \brief Number of segments. */ __cuda_callable__ IndexType getSegmentsCount() const; IndexType getSegmentsCount() const; /** * \brief Set sizes of particular segments. Loading @@ -76,16 +79,20 @@ class BiEllpack * \brief Number segments. */ __cuda_callable__ IndexType getSize() const; IndexType getSize() const; __cuda_callable__ IndexType getStorageSize() const; IndexType getStorageSize() const; __cuda_callable__ IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const; IndexType getGlobalIndex(const IndexType segmentIdx, const IndexType localIdx) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; SegmentViewType getSegmentView(const IndexType segmentIdx) const; /*** * \brief Go over all segments and for each segment element call Loading @@ -93,12 +100,17 @@ class BiEllpack * When its true, the for-loop continues. Once 'f' returns false, the for-loop * is terminated. */ template< typename Function, typename... Args > void forElements( IndexType first, IndexType last, Function& f, Args... args ) const; template <typename Function> void forElements(IndexType first, IndexType last, Function &&f) const; template <typename Function> void forEachElement(Function &&f) const; template< typename Function, typename... Args > void forEachElement( Function& f, Args... args ) const; template <typename Function> void forSegments(IndexType begin, IndexType end, Function &&f) const; template <typename Function> void forEachSegment(Function &&f) const; /*** * \brief Go over all segments and perform a reduction in each of them. Loading Loading @@ -129,7 +141,6 @@ class BiEllpack void computeColumnSizes(const SizesHolder &segmentsSizes); protected: static constexpr int getWarpSize() { return WarpSize; }; static constexpr int getLogWarpSize() { return std::log2(WarpSize); }; Loading src/TNL/Algorithms/Segments/BiEllpack.hpp +33 −6 Original line number Diff line number Diff line Loading @@ -443,12 +443,12 @@ template< typename Device, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forElements( IndexType first, IndexType last, Function& f, Args... args ) const forElements( IndexType first, IndexType last, Function&& f ) const { this->getConstView().forElements( first, last, f, args... ); this->getConstView().forElements( first, last, f ); } template< typename Device, Loading @@ -456,14 +456,41 @@ template< typename Device, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forEachElement( Function& f, Args... args ) const forEachElement( Function&& f ) const { this->forElements( 0, this->getSegmentsCount(), f, args... ); this->forElements( 0, this->getSegmentsCount(), f ); } template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forSegments( IndexType begin, IndexType end, Function&& f ) const { this->getConstView().forSegments( begin, end, f ); } template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forEachSegment( Function&& f ) const { this->getConstView().forEachSegment( f ); } template< typename Device, typename Index, typename IndexAllocator, Loading src/TNL/Algorithms/Segments/BiEllpackSegmentView.h +12 −5 Original line number Diff line number Diff line Loading @@ -44,10 +44,11 @@ class BiEllpackSegmentView * \param groupsWidth is a static vector containing widths of the strip groups */ __cuda_callable__ BiEllpackSegmentView( const IndexType offset, BiEllpackSegmentView( const IndexType segmentIdx, const IndexType offset, const IndexType inStripIdx, const GroupsWidthType& groupsWidth ) : groupOffset( offset ), inStripIdx( inStripIdx ), segmentSize( TNL::sum( groupsWidth ) ), groupsWidth( groupsWidth ){}; : segmentIdx( segmentIdx ), groupOffset( offset ), inStripIdx( inStripIdx ), segmentSize( TNL::sum( groupsWidth ) ), groupsWidth( groupsWidth ){}; __cuda_callable__ IndexType getSize() const Loading Loading @@ -79,9 +80,15 @@ class BiEllpackSegmentView return offset + inStripIdx + localIdx * groupHeight; }; __cuda_callable__ const IndexType& getSegmentIndex() const { return this->segmentIdx; }; protected: IndexType groupOffset, inStripIdx, segmentSize; IndexType segmentIdx, groupOffset, inStripIdx, segmentSize; GroupsWidthType groupsWidth; }; Loading src/TNL/Algorithms/Segments/BiEllpackView.h +13 −8 Original line number Diff line number Diff line Loading @@ -32,13 +32,13 @@ class BiEllpackView using DeviceType = Device; using IndexType = std::remove_const_t< Index >; using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >; using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, IndexType >; using ConstOffsetsView = typename OffsetsView::ConstViewType; using ViewType = BiEllpackView; template< typename Device_, typename Index_ > using ViewTemplate = BiEllpackView< Device_, Index_ >; using ConstViewType = BiEllpackView< Device, std::add_const_t< Index > >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; using ViewTemplate = BiEllpackView< Device_, Index_, Organization, WarpSize >; using ConstViewType = BiEllpackView< Device, std::add_const_t< Index >, Organization, WarpSize >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization, WarpSize >; static constexpr bool havePadding() { return true; }; Loading Loading @@ -111,12 +111,17 @@ class BiEllpackView * When its true, the for-loop continues. Once 'f' returns false, the for-loop * is terminated. */ template< typename Function, typename... Args > void forElements( IndexType first, IndexType last, Function& f, Args... args ) const; template< typename Function > void forElements( IndexType first, IndexType last, Function&& f ) const; template< typename Function, typename... Args > void forEachElement( Function& f, Args... args ) const; template< typename Function > void forEachElement( Function&& f ) const; template< typename Function > void forSegments( IndexType begin, IndexType end, Function&& f ) const; template< typename Function > void forEachSegment( Function&& f ) const; /*** * \brief Go over all segments and perform a reduction in each of them. Loading src/TNL/Algorithms/Segments/BiEllpackView.hpp +42 −8 Original line number Diff line number Diff line Loading @@ -129,7 +129,12 @@ template< typename Device, __cuda_callable__ auto BiEllpackView< Device, Index, Organization, WarpSize >:: getConstView() const -> const ConstViewType { return ConstViewType( size, storageSize, virtualRows, rowPermArray.getConstView(), groupPointers.getConstView() ); BiEllpackView* this_ptr = const_cast< BiEllpackView* >( this ); return ConstViewType( size, storageSize, virtualRows, this_ptr->rowPermArray.getView(), this_ptr->groupPointers.getView() ); } template< typename Device, Loading Loading @@ -255,14 +260,14 @@ template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forElements( IndexType first, IndexType last, Function& f, Args... args ) const forElements( IndexType first, IndexType last, Function&& f ) const { const auto segmentsPermutationView = this->rowPermArray.getConstView(); const auto groupPointersView = this->groupPointers.getConstView(); auto work = [=] __cuda_callable__ ( IndexType segmentIdx, Args... args ) mutable { auto work = [=] __cuda_callable__ ( IndexType segmentIdx ) mutable { const IndexType strip = segmentIdx / getWarpSize(); const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 ); const IndexType rowStripPerm = segmentsPermutationView[ segmentIdx ] - strip * getWarpSize(); Loading Loading @@ -298,19 +303,48 @@ forElements( IndexType first, IndexType last, Function& f, Args... args ) const groupHeight /= 2; } }; Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... ); Algorithms::ParallelFor< DeviceType >::exec( first, last , work ); } template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forEachElement( Function& f, Args... args ) const forEachElement( Function&& f ) const { this->forElements( 0, this->getSegmentsCount(), f, args... ); this->forElements( 0, this->getSegmentsCount(), f ); } template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forSegments( IndexType begin, IndexType end, Function&& function ) const { auto view = this->getConstView(); auto f = [=] __cuda_callable__ ( IndexType segmentIdx ) mutable { auto segment = view.getSegmentView( segmentIdx ); function( segment ); }; TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, f ); } template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forEachSegment( Function&& f ) const { this->forSegments( 0, this->getSegmentsCount(), f ); } template< typename Device, Loading Loading
src/TNL/Algorithms/Segments/BiEllpack.h +109 −98 Original line number Diff line number Diff line Loading @@ -15,9 +15,12 @@ #include <TNL/Algorithms/Segments/BiEllpackView.h> #include <TNL/Algorithms/Segments/SegmentView.h> namespace TNL { namespace Algorithms { namespace Segments { namespace TNL { namespace Algorithms { namespace Segments { template <typename Device, typename Index, Loading @@ -27,16 +30,15 @@ template< typename Device, class BiEllpack { public: using DeviceType = Device; using IndexType = std::remove_const_t<Index>; using OffsetsHolder = Containers::Vector< Index, DeviceType, IndexType, IndexAllocator >; using OffsetsHolder = Containers::Vector< IndexType, DeviceType, IndexType, IndexAllocator>; static constexpr ElementsOrganization getOrganization() { return Organization; } using ViewType = BiEllpackView< Device, Index, Organization >; using ViewType = BiEllpackView< Device, Index, Organization, WarpSize >; template <typename Device_, typename Index_> using ViewTemplate = BiEllpackView< Device_, Index_, Organization >; using ConstViewType = BiEllpackView< Device, std::add_const_t< IndexType >, Organization >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; using ViewTemplate = BiEllpackView<Device_, Index_, Organization, WarpSize >; using ConstViewType = typename ViewType::ConstViewType; using SegmentViewType = typename ViewType::SegmentViewType; static constexpr bool havePadding() { return true; }; Loading @@ -60,7 +62,8 @@ class BiEllpack * \brief Number of segments. */ __cuda_callable__ IndexType getSegmentsCount() const; IndexType getSegmentsCount() const; /** * \brief Set sizes of particular segments. Loading @@ -76,16 +79,20 @@ class BiEllpack * \brief Number segments. */ __cuda_callable__ IndexType getSize() const; IndexType getSize() const; __cuda_callable__ IndexType getStorageSize() const; IndexType getStorageSize() const; __cuda_callable__ IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const; IndexType getGlobalIndex(const IndexType segmentIdx, const IndexType localIdx) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; SegmentViewType getSegmentView(const IndexType segmentIdx) const; /*** * \brief Go over all segments and for each segment element call Loading @@ -93,12 +100,17 @@ class BiEllpack * When its true, the for-loop continues. Once 'f' returns false, the for-loop * is terminated. */ template< typename Function, typename... Args > void forElements( IndexType first, IndexType last, Function& f, Args... args ) const; template <typename Function> void forElements(IndexType first, IndexType last, Function &&f) const; template <typename Function> void forEachElement(Function &&f) const; template< typename Function, typename... Args > void forEachElement( Function& f, Args... args ) const; template <typename Function> void forSegments(IndexType begin, IndexType end, Function &&f) const; template <typename Function> void forEachSegment(Function &&f) const; /*** * \brief Go over all segments and perform a reduction in each of them. Loading Loading @@ -129,7 +141,6 @@ class BiEllpack void computeColumnSizes(const SizesHolder &segmentsSizes); protected: static constexpr int getWarpSize() { return WarpSize; }; static constexpr int getLogWarpSize() { return std::log2(WarpSize); }; Loading
src/TNL/Algorithms/Segments/BiEllpack.hpp +33 −6 Original line number Diff line number Diff line Loading @@ -443,12 +443,12 @@ template< typename Device, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forElements( IndexType first, IndexType last, Function& f, Args... args ) const forElements( IndexType first, IndexType last, Function&& f ) const { this->getConstView().forElements( first, last, f, args... ); this->getConstView().forElements( first, last, f ); } template< typename Device, Loading @@ -456,14 +456,41 @@ template< typename Device, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forEachElement( Function& f, Args... args ) const forEachElement( Function&& f ) const { this->forElements( 0, this->getSegmentsCount(), f, args... ); this->forElements( 0, this->getSegmentsCount(), f ); } template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forSegments( IndexType begin, IndexType end, Function&& f ) const { this->getConstView().forSegments( begin, end, f ); } template< typename Device, typename Index, typename IndexAllocator, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpack< Device, Index, IndexAllocator, Organization, WarpSize >:: forEachSegment( Function&& f ) const { this->getConstView().forEachSegment( f ); } template< typename Device, typename Index, typename IndexAllocator, Loading
src/TNL/Algorithms/Segments/BiEllpackSegmentView.h +12 −5 Original line number Diff line number Diff line Loading @@ -44,10 +44,11 @@ class BiEllpackSegmentView * \param groupsWidth is a static vector containing widths of the strip groups */ __cuda_callable__ BiEllpackSegmentView( const IndexType offset, BiEllpackSegmentView( const IndexType segmentIdx, const IndexType offset, const IndexType inStripIdx, const GroupsWidthType& groupsWidth ) : groupOffset( offset ), inStripIdx( inStripIdx ), segmentSize( TNL::sum( groupsWidth ) ), groupsWidth( groupsWidth ){}; : segmentIdx( segmentIdx ), groupOffset( offset ), inStripIdx( inStripIdx ), segmentSize( TNL::sum( groupsWidth ) ), groupsWidth( groupsWidth ){}; __cuda_callable__ IndexType getSize() const Loading Loading @@ -79,9 +80,15 @@ class BiEllpackSegmentView return offset + inStripIdx + localIdx * groupHeight; }; __cuda_callable__ const IndexType& getSegmentIndex() const { return this->segmentIdx; }; protected: IndexType groupOffset, inStripIdx, segmentSize; IndexType segmentIdx, groupOffset, inStripIdx, segmentSize; GroupsWidthType groupsWidth; }; Loading
src/TNL/Algorithms/Segments/BiEllpackView.h +13 −8 Original line number Diff line number Diff line Loading @@ -32,13 +32,13 @@ class BiEllpackView using DeviceType = Device; using IndexType = std::remove_const_t< Index >; using OffsetsView = typename Containers::VectorView< Index, DeviceType, IndexType >; using OffsetsView = typename Containers::VectorView< IndexType, DeviceType, IndexType >; using ConstOffsetsView = typename OffsetsView::ConstViewType; using ViewType = BiEllpackView; template< typename Device_, typename Index_ > using ViewTemplate = BiEllpackView< Device_, Index_ >; using ConstViewType = BiEllpackView< Device, std::add_const_t< Index > >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization >; using ViewTemplate = BiEllpackView< Device_, Index_, Organization, WarpSize >; using ConstViewType = BiEllpackView< Device, std::add_const_t< Index >, Organization, WarpSize >; using SegmentViewType = BiEllpackSegmentView< IndexType, Organization, WarpSize >; static constexpr bool havePadding() { return true; }; Loading Loading @@ -111,12 +111,17 @@ class BiEllpackView * When its true, the for-loop continues. Once 'f' returns false, the for-loop * is terminated. */ template< typename Function, typename... Args > void forElements( IndexType first, IndexType last, Function& f, Args... args ) const; template< typename Function > void forElements( IndexType first, IndexType last, Function&& f ) const; template< typename Function, typename... Args > void forEachElement( Function& f, Args... args ) const; template< typename Function > void forEachElement( Function&& f ) const; template< typename Function > void forSegments( IndexType begin, IndexType end, Function&& f ) const; template< typename Function > void forEachSegment( Function&& f ) const; /*** * \brief Go over all segments and perform a reduction in each of them. Loading
src/TNL/Algorithms/Segments/BiEllpackView.hpp +42 −8 Original line number Diff line number Diff line Loading @@ -129,7 +129,12 @@ template< typename Device, __cuda_callable__ auto BiEllpackView< Device, Index, Organization, WarpSize >:: getConstView() const -> const ConstViewType { return ConstViewType( size, storageSize, virtualRows, rowPermArray.getConstView(), groupPointers.getConstView() ); BiEllpackView* this_ptr = const_cast< BiEllpackView* >( this ); return ConstViewType( size, storageSize, virtualRows, this_ptr->rowPermArray.getView(), this_ptr->groupPointers.getView() ); } template< typename Device, Loading Loading @@ -255,14 +260,14 @@ template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forElements( IndexType first, IndexType last, Function& f, Args... args ) const forElements( IndexType first, IndexType last, Function&& f ) const { const auto segmentsPermutationView = this->rowPermArray.getConstView(); const auto groupPointersView = this->groupPointers.getConstView(); auto work = [=] __cuda_callable__ ( IndexType segmentIdx, Args... args ) mutable { auto work = [=] __cuda_callable__ ( IndexType segmentIdx ) mutable { const IndexType strip = segmentIdx / getWarpSize(); const IndexType firstGroupInStrip = strip * ( getLogWarpSize() + 1 ); const IndexType rowStripPerm = segmentsPermutationView[ segmentIdx ] - strip * getWarpSize(); Loading Loading @@ -298,19 +303,48 @@ forElements( IndexType first, IndexType last, Function& f, Args... args ) const groupHeight /= 2; } }; Algorithms::ParallelFor< DeviceType >::exec( first, last , work, args... ); Algorithms::ParallelFor< DeviceType >::exec( first, last , work ); } template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function, typename... Args > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forEachElement( Function& f, Args... args ) const forEachElement( Function&& f ) const { this->forElements( 0, this->getSegmentsCount(), f, args... ); this->forElements( 0, this->getSegmentsCount(), f ); } template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forSegments( IndexType begin, IndexType end, Function&& function ) const { auto view = this->getConstView(); auto f = [=] __cuda_callable__ ( IndexType segmentIdx ) mutable { auto segment = view.getSegmentView( segmentIdx ); function( segment ); }; TNL::Algorithms::ParallelFor< DeviceType >::exec( begin, end, f ); } template< typename Device, typename Index, ElementsOrganization Organization, int WarpSize > template< typename Function > void BiEllpackView< Device, Index, Organization, WarpSize >:: forEachSegment( Function&& f ) const { this->forSegments( 0, this->getSegmentsCount(), f ); } template< typename Device, Loading