Loading src/TNL/Containers/Segments/BiEllpack.h +33 −2 Original line number Diff line number Diff line Loading @@ -75,7 +75,7 @@ class BiEllpack IndexType getStorageSize() const; __cuda_callable__ IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const; IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; Loading Loading @@ -117,7 +117,23 @@ class BiEllpack static constexpr int getWarpSize() { return WarpSize; }; static constexpr int getLogWarpSize() { return std::log( WarpSize ); }; static constexpr int getLogWarpSize() { return std::log2( WarpSize ); }; template< typename SizesHolder = OffsetsHolder > void performRowBubbleSort( const SizesHolder& segmentsSize ); template< typename SizesHolder = OffsetsHolder > void computeColumnSizes( const SizesHolder& segmentsSizes ); template< typename SizesHolder = OffsetsHolder > void verifyRowPerm( const SizesHolder& segmentsSizes ); template< typename SizesHolder = OffsetsHolder > void verifyRowLengths( const SizesHolder& segmentsSizes ); IndexType getStripLength( const IndexType stripIdx ) const; IndexType getGroupLength( const IndexType strip, const IndexType group ) const; IndexType size = 0, storageSize = 0; Loading @@ -127,6 +143,21 @@ class BiEllpack OffsetsHolder groupPointers; // TODO: Replace later __cuda_callable__ Index power( const IndexType number, const IndexType exponent ) const { if( exponent >= 0 ) { IndexType result = 1; for( IndexType i = 0; i < exponent; i++ ) result *= number; return result; } return 0; }; template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_, int WarpSize_ > friend class BiEllpack; }; Loading src/TNL/Containers/Segments/BiEllpack.hpp +242 −10 Original line number Diff line number Diff line Loading @@ -10,6 +10,7 @@ #pragma once #include <math.h> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Containers/Segments/BiEllpack.h> Loading Loading @@ -113,20 +114,240 @@ template< typename Device, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: setSegmentsSizes( const SizesHolder& segmentsSizes ) void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: performRowBubbleSort( const SizesHolder& segmentsSizes ) { this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } ); if( std::is_same< DeviceType, Devices::Host >::value ) { IndexType strips = this->virtualRows / getWarpSize(); for( IndexType i = 0; i < strips; i++ ) { IndexType begin = i * getWarpSize(); IndexType end = ( i + 1 ) * getWarpSize() - 1; if(this->getSize() - 1 < end) end = this->getSize() - 1; bool sorted = false; IndexType permIndex1, permIndex2, offset = 0; while( !sorted ) { sorted = true; for( IndexType j = begin + offset; j < end - offset; j++ ) { for( IndexType k = begin; k < end + 1; k++ ) { if( this->rowPermArray.getElement( k ) == j ) permIndex1 = k; if( this->rowPermArray.getElement( k ) == j + 1 ) permIndex2 = k; } if( segmentsSizes.getElement( permIndex1 ) < segmentsSizes.getElement( permIndex2 ) ) { IndexType temp = this->rowPermArray.getElement( permIndex1 ); this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); this->rowPermArray.setElement( permIndex2, temp ); sorted = false; } } for( IndexType j = end - 1 - offset; j > begin + offset; j-- ) { for( IndexType k = begin; k < end + 1; k++ ) { if( this->rowPermArray.getElement( k ) == j ) permIndex1 = k; if( this->rowPermArray.getElement( k ) == j - 1 ) permIndex2 = k; } if( segmentsSizes.getElement( permIndex2 ) < segmentsSizes.getElement( permIndex1 ) ) { IndexType temp = this->rowPermArray.getElement( permIndex1 ); this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); this->rowPermArray.setElement( permIndex2, temp ); sorted = false; } } offset++; } } } } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: computeColumnSizes( const SizesHolder& segmentsSizes ) { IndexType numberOfStrips = this->virtualRows / getWarpSize(); auto groupPointersView = this->groupPointers.getView(); auto segmentsPermutationView = this->rowPermArray.getView(); auto segmentsSizesView = segmentsSizes.getConstView(); const IndexType size = this->getSize(); Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, this->virtualRows / getWarpSize(), [=] __cuda_callable__ ( const IndexType strip ) mutable { IndexType firstSegment = strip * getWarpSize(); IndexType groupBegin = strip * ( getLogWarpSize() + 1 ); IndexType emptyGroups = 0; //// // The last strip can be shorter if( strip == numberOfStrips - 1 ) { IndexType segmentsCount = size - firstSegment; while( !( segmentsCount > TNL::pow( getLogWarpSize() - 1 - emptyGroups, 2 ) ) ) emptyGroups++; for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ ) groupPointersView[ group ] = 0; } IndexType allocatedColumns = 0; for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ ) { IndexType segmentIdx = TNL::pow( getLogWarpSize() - 1 - groupIdx, 2 ); IndexType permSegm = 0; while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment ) permSegm++; const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns; const IndexType groupHeight = TNL::pow( getLogWarpSize() - groupIdx, 2 ); const IndexType groupSize = groupWidth * groupHeight; allocatedColumns = segmentsSizes[ permSegm + firstSegment ]; groupPointersView[ groupIdx + groupBegin ] = groupSize; } } ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: verifyRowPerm( const SizesHolder& segmentsSizes ) { bool ok = true; IndexType numberOfStrips = this->virtualRows / getWarpSize(); for( IndexType strip = 0; strip < numberOfStrips; strip++ ) { IndexType begin = strip * getWarpSize(); IndexType end = ( strip + 1 ) * getWarpSize(); if( this->getSize() < end ) end = this->getSize(); for( IndexType i = begin; i < end - 1; i++ ) { IndexType permIndex1, permIndex2; bool first = false; bool second = false; for( IndexType j = begin; j < end; j++ ) { if( this->rowPermArray.getElement( j ) == i ) { permIndex1 = j; first = true; } if( this->rowPermArray.getElement( j ) == i + 1 ) { permIndex2 = j; second = true; } } if( !first || !second ) std::cout << "Wrong permutation!" << std::endl; if( segmentsSizes.getElement( permIndex1 ) >= segmentsSizes.getElement( permIndex2 ) ) continue; else ok = false; } } if( !ok ) throw( std::logic_error( "Segments permutaion verification failed." ) ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: verifyRowLengths( const SizesHolder& segmentsSizes ) { bool ok = true; for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ ) { BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments; const IndexType strip = segmentIdx / getWarpSize(); const IndexType stripLength = this->getStripLength( strip ); const IndexType groupBegin = ( getLogWarpSize() + 1 ) * strip; const IndexType rowStripPerm = this->rowPermArray.getElement( segmentIdx ) - strip * getWarpSize(); const IndexType begin = this->groupPointers.getElement( groupBegin ) * getWarpSize() + rowStripPerm * stripLength; IndexType elementPtr = begin; IndexType rowLength = 0; const IndexType groupsCount = details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx ); for( IndexType group = 0; group < groupsCount; group++ ) { for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ ) { IndexType biElementPtr = elementPtr; for( IndexType j = 0; j < this->power( 2, group ); j++ ) { rowLength++; biElementPtr += this->power( 2, getLogWarpSize() - group ) * stripLength; } elementPtr++; } } if( segmentsSizes.getElement( segmentIdx ) > rowLength ) ok = false; } if( ! ok ) throw( std::logic_error( "Segments capacities verification failed." ) ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: setSegmentsSizes( const SizesHolder& segmentsSizes ) { //if( std::is_same< DeviceType, Devices::Host >::value ) // { this->size = segmentsSizes.getSize(); if( this->size % WarpSize != 0 ) this->virtualRows = this->size + getWarpSize() - ( this->size % getWarpSize() ); else this->virtualRows = this->size; IndexType strips = this->virtualRows / getWarpSize(); this->rowPermArray.setSize( this->size ); this->groupPointers.setSize( strips * ( getLogWarpSize() + 1 ) + 1 ); this->groupPointers = 0; this->performRowBubbleSort( segmentsSizes ); this->computeColumnSizes( segmentsSizes ); this->groupPointers.template scan< Algorithms::ScanType::Exclusive >(); this->verifyRowPerm( segmentsSizes ); this->verifyRowLengths( segmentsSizes ); this->storageSize = getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) ); /*} else { BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder > hostSegments; Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes ); hostSegments.setSegmentsSizes( hostSegmentsSizes ); *this = hostSegments; } }*/ } template< typename Device, Loading Loading @@ -182,7 +403,7 @@ template< typename Device, bool RowMajorOrder, int WarpSize > __cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType { return details::BiEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex( rowPermArray.getConstView(), Loading Loading @@ -308,11 +529,22 @@ template< typename Device, typename IndexAllocator, bool RowMajorOrder, int WarpSize > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: printStructure( std::ostream& str ) auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: getStripLength( const IndexType stripIdx ) const -> IndexType { return details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: getGroupLength( const IndexType strip, const IndexType group ) const -> IndexType { this->getView().printStructure( str ); return this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group + 1 ) - this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group ); } } // namespace Segments Loading src/TNL/Containers/Segments/BiEllpackSegmentView.h +39 −51 Original line number Diff line number Diff line Loading @@ -10,64 +10,42 @@ #pragma once #include <TNL/Containers/StaticVector.h> namespace TNL { namespace Containers { namespace Segments { template< typename Index, bool RowMajorOrder = false > class BiEllpackSegmentView; template< typename Index > class BiEllpackSegmentView< Index, false > bool RowMajorOrder = false, int WarpSize = 32 > class BiEllpackSegmentView { public: using IndexType = Index; __cuda_callable__ BiEllpackSegmentView( const IndexType offset, const IndexType size, const IndexType chunkSize, // this is only for compatibility with the following specialization const IndexType chunksInSlice ) // this one as well - both can be replaced when we could use constexprif in C++17 : segmentOffset( offset ), segmentSize( size ){}; __cuda_callable__ BiEllpackSegmentView( const BiEllpackSegmentView& view ) : segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ){}; __cuda_callable__ IndexType getSize() const { return this->segmentSize; }; __cuda_callable__ IndexType getGlobalIndex( const IndexType localIndex ) const { TNL_ASSERT_LT( localIndex, segmentSize, "Local index exceeds segment bounds." ); return segmentOffset + localIndex; }; static constexpr int getWarpSize() { return WarpSize; }; protected: static constexpr int getLogWarpSize() { return std::log2( WarpSize ); }; IndexType segmentOffset, segmentSize; }; template< typename Index > class BiEllpackSegmentView< Index, true > { public: static constexpr int getGroupsCount() { return getLogWarpSize() + 1; }; using IndexType = Index; using GroupsWidthType = Containers::StaticVector< getGroupsCount(), IndexType >; /** * \brief Constructor. * * \param offset is offset of the first group of the strip the segment belongs to. * \param size is the segment size * \param inStripIdx is index of the segment within its strip. * \param groupsWidth is a static vector containing widths of the strip groups */ __cuda_callable__ BiEllpackSegmentView( const IndexType offset, const IndexType size, const IndexType chunkSize, const IndexType chunksInSlice ) : segmentOffset( offset ), segmentSize( size ), chunkSize( chunkSize ), chunksInSlice( chunksInSlice ){}; const IndexType inStripIdx, const GroupsWidthType& groupsWidth ) : groupOffset( offset ), segmentSize( TNL::sum( groupsWidth ) ), inStripIdx( inStripIdx ), groupsWidth( groupsWidth ){}; __cuda_callable__ IndexType getSize() const Loading @@ -76,17 +54,27 @@ class BiEllpackSegmentView< Index, true > }; __cuda_callable__ IndexType getGlobalIndex( const IndexType localIdx ) const IndexType getGlobalIndex( IndexType localIdx ) const { IndexType i( 0 ), offset( groupOffset ), groupHeight( getWarpSize() ); while( localIdx > groupsWidth[ i ] ) { TNL_ASSERT_LT( localIdx, segmentSize, "Local index exceeds segment bounds." ); const IndexType chunkIdx = localIdx / chunkSize; const IndexType inChunkOffset = localIdx % chunkSize; return segmentOffset + inChunkOffset * chunksInSlice + chunkIdx; localIdx -= groupsWidth[ i ]; offset += groupsWidth[ i++ ] * groupHeight; groupHeight /= 2; } TNL_ASSERT_LE( i, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." ); if( RowMajorOrder ) return offset + inStripIdx * groupsWidth[ i ] + localIdx; else return offset + inStripIdx + localIdx * groupHeight; }; protected: IndexType segmentOffset, segmentSize, chunkSize, chunksInSlice; IndexType groupOffset, inStripIdx, segmentSize; GroupsWidthType groupsWidth; }; } //namespace Segments Loading src/TNL/Containers/Segments/BiEllpackView.h +1 −1 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ class BiEllpackView static constexpr int getWarpSize() { return WarpSize; }; static constexpr int getLogWarpSize() { return std::log( WarpSize ); }; static constexpr int getLogWarpSize() { return std::log2( WarpSize ); }; IndexType size = 0, storageSize = 0; Loading src/TNL/Containers/Segments/BiEllpackView.hpp +3 −2 Original line number Diff line number Diff line Loading @@ -285,6 +285,7 @@ BiEllpackView< Device, Index, RowMajorOrder, WarpSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; } template< typename Device, Loading @@ -310,8 +311,8 @@ operator=( const BiEllpackView& source ) this->size = source.size; this->storageSize = source.storageSize; this->virtualRows = source.virtualRows; this->rowPermArray = source.rowPermArray; this->groupPointers = source.groupPointers; this->rowPermArray.bind( source.rowPermArray ); this->groupPointers.bind( source.groupPointers ); return *this; } Loading Loading
src/TNL/Containers/Segments/BiEllpack.h +33 −2 Original line number Diff line number Diff line Loading @@ -75,7 +75,7 @@ class BiEllpack IndexType getStorageSize() const; __cuda_callable__ IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const; IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const; __cuda_callable__ SegmentViewType getSegmentView( const IndexType segmentIdx ) const; Loading Loading @@ -117,7 +117,23 @@ class BiEllpack static constexpr int getWarpSize() { return WarpSize; }; static constexpr int getLogWarpSize() { return std::log( WarpSize ); }; static constexpr int getLogWarpSize() { return std::log2( WarpSize ); }; template< typename SizesHolder = OffsetsHolder > void performRowBubbleSort( const SizesHolder& segmentsSize ); template< typename SizesHolder = OffsetsHolder > void computeColumnSizes( const SizesHolder& segmentsSizes ); template< typename SizesHolder = OffsetsHolder > void verifyRowPerm( const SizesHolder& segmentsSizes ); template< typename SizesHolder = OffsetsHolder > void verifyRowLengths( const SizesHolder& segmentsSizes ); IndexType getStripLength( const IndexType stripIdx ) const; IndexType getGroupLength( const IndexType strip, const IndexType group ) const; IndexType size = 0, storageSize = 0; Loading @@ -127,6 +143,21 @@ class BiEllpack OffsetsHolder groupPointers; // TODO: Replace later __cuda_callable__ Index power( const IndexType number, const IndexType exponent ) const { if( exponent >= 0 ) { IndexType result = 1; for( IndexType i = 0; i < exponent; i++ ) result *= number; return result; } return 0; }; template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_, int WarpSize_ > friend class BiEllpack; }; Loading
src/TNL/Containers/Segments/BiEllpack.hpp +242 −10 Original line number Diff line number Diff line Loading @@ -10,6 +10,7 @@ #pragma once #include <math.h> #include <TNL/Containers/Vector.h> #include <TNL/Algorithms/ParallelFor.h> #include <TNL/Containers/Segments/BiEllpack.h> Loading Loading @@ -113,20 +114,240 @@ template< typename Device, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: setSegmentsSizes( const SizesHolder& segmentsSizes ) void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: performRowBubbleSort( const SizesHolder& segmentsSizes ) { this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } ); if( std::is_same< DeviceType, Devices::Host >::value ) { IndexType strips = this->virtualRows / getWarpSize(); for( IndexType i = 0; i < strips; i++ ) { IndexType begin = i * getWarpSize(); IndexType end = ( i + 1 ) * getWarpSize() - 1; if(this->getSize() - 1 < end) end = this->getSize() - 1; bool sorted = false; IndexType permIndex1, permIndex2, offset = 0; while( !sorted ) { sorted = true; for( IndexType j = begin + offset; j < end - offset; j++ ) { for( IndexType k = begin; k < end + 1; k++ ) { if( this->rowPermArray.getElement( k ) == j ) permIndex1 = k; if( this->rowPermArray.getElement( k ) == j + 1 ) permIndex2 = k; } if( segmentsSizes.getElement( permIndex1 ) < segmentsSizes.getElement( permIndex2 ) ) { IndexType temp = this->rowPermArray.getElement( permIndex1 ); this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); this->rowPermArray.setElement( permIndex2, temp ); sorted = false; } } for( IndexType j = end - 1 - offset; j > begin + offset; j-- ) { for( IndexType k = begin; k < end + 1; k++ ) { if( this->rowPermArray.getElement( k ) == j ) permIndex1 = k; if( this->rowPermArray.getElement( k ) == j - 1 ) permIndex2 = k; } if( segmentsSizes.getElement( permIndex2 ) < segmentsSizes.getElement( permIndex1 ) ) { IndexType temp = this->rowPermArray.getElement( permIndex1 ); this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) ); this->rowPermArray.setElement( permIndex2, temp ); sorted = false; } } offset++; } } } } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: computeColumnSizes( const SizesHolder& segmentsSizes ) { IndexType numberOfStrips = this->virtualRows / getWarpSize(); auto groupPointersView = this->groupPointers.getView(); auto segmentsPermutationView = this->rowPermArray.getView(); auto segmentsSizesView = segmentsSizes.getConstView(); const IndexType size = this->getSize(); Algorithms::ParallelFor< DeviceType >::exec( ( IndexType ) 0, this->virtualRows / getWarpSize(), [=] __cuda_callable__ ( const IndexType strip ) mutable { IndexType firstSegment = strip * getWarpSize(); IndexType groupBegin = strip * ( getLogWarpSize() + 1 ); IndexType emptyGroups = 0; //// // The last strip can be shorter if( strip == numberOfStrips - 1 ) { IndexType segmentsCount = size - firstSegment; while( !( segmentsCount > TNL::pow( getLogWarpSize() - 1 - emptyGroups, 2 ) ) ) emptyGroups++; for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ ) groupPointersView[ group ] = 0; } IndexType allocatedColumns = 0; for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ ) { IndexType segmentIdx = TNL::pow( getLogWarpSize() - 1 - groupIdx, 2 ); IndexType permSegm = 0; while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment ) permSegm++; const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns; const IndexType groupHeight = TNL::pow( getLogWarpSize() - groupIdx, 2 ); const IndexType groupSize = groupWidth * groupHeight; allocatedColumns = segmentsSizes[ permSegm + firstSegment ]; groupPointersView[ groupIdx + groupBegin ] = groupSize; } } ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: verifyRowPerm( const SizesHolder& segmentsSizes ) { bool ok = true; IndexType numberOfStrips = this->virtualRows / getWarpSize(); for( IndexType strip = 0; strip < numberOfStrips; strip++ ) { IndexType begin = strip * getWarpSize(); IndexType end = ( strip + 1 ) * getWarpSize(); if( this->getSize() < end ) end = this->getSize(); for( IndexType i = begin; i < end - 1; i++ ) { IndexType permIndex1, permIndex2; bool first = false; bool second = false; for( IndexType j = begin; j < end; j++ ) { if( this->rowPermArray.getElement( j ) == i ) { permIndex1 = j; first = true; } if( this->rowPermArray.getElement( j ) == i + 1 ) { permIndex2 = j; second = true; } } if( !first || !second ) std::cout << "Wrong permutation!" << std::endl; if( segmentsSizes.getElement( permIndex1 ) >= segmentsSizes.getElement( permIndex2 ) ) continue; else ok = false; } } if( !ok ) throw( std::logic_error( "Segments permutaion verification failed." ) ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: verifyRowLengths( const SizesHolder& segmentsSizes ) { bool ok = true; for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ ) { BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments; const IndexType strip = segmentIdx / getWarpSize(); const IndexType stripLength = this->getStripLength( strip ); const IndexType groupBegin = ( getLogWarpSize() + 1 ) * strip; const IndexType rowStripPerm = this->rowPermArray.getElement( segmentIdx ) - strip * getWarpSize(); const IndexType begin = this->groupPointers.getElement( groupBegin ) * getWarpSize() + rowStripPerm * stripLength; IndexType elementPtr = begin; IndexType rowLength = 0; const IndexType groupsCount = details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx ); for( IndexType group = 0; group < groupsCount; group++ ) { for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ ) { IndexType biElementPtr = elementPtr; for( IndexType j = 0; j < this->power( 2, group ); j++ ) { rowLength++; biElementPtr += this->power( 2, getLogWarpSize() - group ) * stripLength; } elementPtr++; } } if( segmentsSizes.getElement( segmentIdx ) > rowLength ) ok = false; } if( ! ok ) throw( std::logic_error( "Segments capacities verification failed." ) ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > template< typename SizesHolder > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: setSegmentsSizes( const SizesHolder& segmentsSizes ) { //if( std::is_same< DeviceType, Devices::Host >::value ) // { this->size = segmentsSizes.getSize(); if( this->size % WarpSize != 0 ) this->virtualRows = this->size + getWarpSize() - ( this->size % getWarpSize() ); else this->virtualRows = this->size; IndexType strips = this->virtualRows / getWarpSize(); this->rowPermArray.setSize( this->size ); this->groupPointers.setSize( strips * ( getLogWarpSize() + 1 ) + 1 ); this->groupPointers = 0; this->performRowBubbleSort( segmentsSizes ); this->computeColumnSizes( segmentsSizes ); this->groupPointers.template scan< Algorithms::ScanType::Exclusive >(); this->verifyRowPerm( segmentsSizes ); this->verifyRowLengths( segmentsSizes ); this->storageSize = getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) ); /*} else { BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder > hostSegments; Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes ); hostSegments.setSegmentsSizes( hostSegmentsSizes ); *this = hostSegments; } }*/ } template< typename Device, Loading Loading @@ -182,7 +403,7 @@ template< typename Device, bool RowMajorOrder, int WarpSize > __cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType { return details::BiEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex( rowPermArray.getConstView(), Loading Loading @@ -308,11 +529,22 @@ template< typename Device, typename IndexAllocator, bool RowMajorOrder, int WarpSize > void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: printStructure( std::ostream& str ) auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: getStripLength( const IndexType stripIdx ) const -> IndexType { return details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx ); } template< typename Device, typename Index, typename IndexAllocator, bool RowMajorOrder, int WarpSize > auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >:: getGroupLength( const IndexType strip, const IndexType group ) const -> IndexType { this->getView().printStructure( str ); return this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group + 1 ) - this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group ); } } // namespace Segments Loading
src/TNL/Containers/Segments/BiEllpackSegmentView.h +39 −51 Original line number Diff line number Diff line Loading @@ -10,64 +10,42 @@ #pragma once #include <TNL/Containers/StaticVector.h> namespace TNL { namespace Containers { namespace Segments { template< typename Index, bool RowMajorOrder = false > class BiEllpackSegmentView; template< typename Index > class BiEllpackSegmentView< Index, false > bool RowMajorOrder = false, int WarpSize = 32 > class BiEllpackSegmentView { public: using IndexType = Index; __cuda_callable__ BiEllpackSegmentView( const IndexType offset, const IndexType size, const IndexType chunkSize, // this is only for compatibility with the following specialization const IndexType chunksInSlice ) // this one as well - both can be replaced when we could use constexprif in C++17 : segmentOffset( offset ), segmentSize( size ){}; __cuda_callable__ BiEllpackSegmentView( const BiEllpackSegmentView& view ) : segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ){}; __cuda_callable__ IndexType getSize() const { return this->segmentSize; }; __cuda_callable__ IndexType getGlobalIndex( const IndexType localIndex ) const { TNL_ASSERT_LT( localIndex, segmentSize, "Local index exceeds segment bounds." ); return segmentOffset + localIndex; }; static constexpr int getWarpSize() { return WarpSize; }; protected: static constexpr int getLogWarpSize() { return std::log2( WarpSize ); }; IndexType segmentOffset, segmentSize; }; template< typename Index > class BiEllpackSegmentView< Index, true > { public: static constexpr int getGroupsCount() { return getLogWarpSize() + 1; }; using IndexType = Index; using GroupsWidthType = Containers::StaticVector< getGroupsCount(), IndexType >; /** * \brief Constructor. * * \param offset is offset of the first group of the strip the segment belongs to. * \param size is the segment size * \param inStripIdx is index of the segment within its strip. * \param groupsWidth is a static vector containing widths of the strip groups */ __cuda_callable__ BiEllpackSegmentView( const IndexType offset, const IndexType size, const IndexType chunkSize, const IndexType chunksInSlice ) : segmentOffset( offset ), segmentSize( size ), chunkSize( chunkSize ), chunksInSlice( chunksInSlice ){}; const IndexType inStripIdx, const GroupsWidthType& groupsWidth ) : groupOffset( offset ), segmentSize( TNL::sum( groupsWidth ) ), inStripIdx( inStripIdx ), groupsWidth( groupsWidth ){}; __cuda_callable__ IndexType getSize() const Loading @@ -76,17 +54,27 @@ class BiEllpackSegmentView< Index, true > }; __cuda_callable__ IndexType getGlobalIndex( const IndexType localIdx ) const IndexType getGlobalIndex( IndexType localIdx ) const { IndexType i( 0 ), offset( groupOffset ), groupHeight( getWarpSize() ); while( localIdx > groupsWidth[ i ] ) { TNL_ASSERT_LT( localIdx, segmentSize, "Local index exceeds segment bounds." ); const IndexType chunkIdx = localIdx / chunkSize; const IndexType inChunkOffset = localIdx % chunkSize; return segmentOffset + inChunkOffset * chunksInSlice + chunkIdx; localIdx -= groupsWidth[ i ]; offset += groupsWidth[ i++ ] * groupHeight; groupHeight /= 2; } TNL_ASSERT_LE( i, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." ); if( RowMajorOrder ) return offset + inStripIdx * groupsWidth[ i ] + localIdx; else return offset + inStripIdx + localIdx * groupHeight; }; protected: IndexType segmentOffset, segmentSize, chunkSize, chunksInSlice; IndexType groupOffset, inStripIdx, segmentSize; GroupsWidthType groupsWidth; }; } //namespace Segments Loading
src/TNL/Containers/Segments/BiEllpackView.h +1 −1 Original line number Diff line number Diff line Loading @@ -134,7 +134,7 @@ class BiEllpackView static constexpr int getWarpSize() { return WarpSize; }; static constexpr int getLogWarpSize() { return std::log( WarpSize ); }; static constexpr int getLogWarpSize() { return std::log2( WarpSize ); }; IndexType size = 0, storageSize = 0; Loading
src/TNL/Containers/Segments/BiEllpackView.hpp +3 −2 Original line number Diff line number Diff line Loading @@ -285,6 +285,7 @@ BiEllpackView< Device, Index, RowMajorOrder, WarpSize >:: segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const { using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType; } template< typename Device, Loading @@ -310,8 +311,8 @@ operator=( const BiEllpackView& source ) this->size = source.size; this->storageSize = source.storageSize; this->virtualRows = source.virtualRows; this->rowPermArray = source.rowPermArray; this->groupPointers = source.groupPointers; this->rowPermArray.bind( source.rowPermArray ); this->groupPointers.bind( source.groupPointers ); return *this; } Loading