Implementing BiEllpack segments. (c3bf6918) · Commits · TNL / tnl-dev

src/TNL/Containers/Segments/BiEllpack.h

+33 −2

Original line number	Diff line number	Diff line
		@@ -75,7 +75,7 @@ class BiEllpack
		IndexType getStorageSize() const;

		__cuda_callable__
		IndexType getGlobalIndex( const Index segmentIdx, const Index localIdx ) const;
		IndexType getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const;

		__cuda_callable__
		SegmentViewType getSegmentView( const IndexType segmentIdx ) const;
		@@ -117,7 +117,23 @@ class BiEllpack

		static constexpr int getWarpSize() { return WarpSize; };

		static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
		static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };

		template< typename SizesHolder = OffsetsHolder >
		void performRowBubbleSort( const SizesHolder& segmentsSize );

		template< typename SizesHolder = OffsetsHolder >
		void computeColumnSizes( const SizesHolder& segmentsSizes );

		template< typename SizesHolder = OffsetsHolder >
		void verifyRowPerm( const SizesHolder& segmentsSizes );

		template< typename SizesHolder = OffsetsHolder >
		void verifyRowLengths( const SizesHolder& segmentsSizes );

		IndexType getStripLength( const IndexType stripIdx ) const;

		IndexType getGroupLength( const IndexType strip, const IndexType group ) const;

		IndexType size = 0, storageSize = 0;

		@@ -127,6 +143,21 @@ class BiEllpack

		OffsetsHolder groupPointers;



		// TODO: Replace later
		__cuda_callable__ Index power( const IndexType number, const IndexType exponent ) const
		{
		if( exponent >= 0 )
		{
		IndexType result = 1;
		for( IndexType i = 0; i < exponent; i++ )
		result *= number;
		return result;
		}
		return 0;
		};

		template< typename Device_, typename Index_, typename IndexAllocator_, bool RowMajorOrder_, int WarpSize_ >
		friend class BiEllpack;
		};

src/TNL/Containers/Segments/BiEllpack.hpp

+242 −10

Original line number	Diff line number	Diff line
		@@ -10,6 +10,7 @@

		#pragma once

		#include <math.h>
		#include <TNL/Containers/Vector.h>
		#include <TNL/Algorithms/ParallelFor.h>
		#include <TNL/Containers/Segments/BiEllpack.h>
		@@ -113,20 +114,240 @@ template< typename Device,
		bool RowMajorOrder,
		int WarpSize >
		template< typename SizesHolder >
		void
		BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		setSegmentsSizes( const SizesHolder& segmentsSizes )
		void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		performRowBubbleSort( const SizesHolder& segmentsSizes )
		{
		this->rowPermArray.evaluate( [] __cuda_callable__ ( const IndexType i ) -> IndexType { return i; } );

		if( std::is_same< DeviceType, Devices::Host >::value )
		{
		IndexType strips = this->virtualRows / getWarpSize();
		for( IndexType i = 0; i < strips; i++ )
		{
		IndexType begin = i * getWarpSize();
		IndexType end = ( i + 1 ) * getWarpSize() - 1;
		if(this->getSize() - 1 < end)
		end = this->getSize() - 1;
		bool sorted = false;
		IndexType permIndex1, permIndex2, offset = 0;
		while( !sorted )
		{
		sorted = true;
		for( IndexType j = begin + offset; j < end - offset; j++ )
		{
		for( IndexType k = begin; k < end + 1; k++ )
		{
		if( this->rowPermArray.getElement( k ) == j )
		permIndex1 = k;
		if( this->rowPermArray.getElement( k ) == j + 1 )
		permIndex2 = k;
		}
		if( segmentsSizes.getElement( permIndex1 ) < segmentsSizes.getElement( permIndex2 ) )
		{
		IndexType temp = this->rowPermArray.getElement( permIndex1 );
		this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
		this->rowPermArray.setElement( permIndex2, temp );
		sorted = false;
		}
		}
		for( IndexType j = end - 1 - offset; j > begin + offset; j-- )
		{
		for( IndexType k = begin; k < end + 1; k++ )
		{
		if( this->rowPermArray.getElement( k ) == j )
		permIndex1 = k;
		if( this->rowPermArray.getElement( k ) == j - 1 )
		permIndex2 = k;
		}
		if( segmentsSizes.getElement( permIndex2 ) < segmentsSizes.getElement( permIndex1 ) )
		{
		IndexType temp = this->rowPermArray.getElement( permIndex1 );
		this->rowPermArray.setElement( permIndex1, this->rowPermArray.getElement( permIndex2 ) );
		this->rowPermArray.setElement( permIndex2, temp );
		sorted = false;
		}
		}
		offset++;
		}
		}
		}
		}

		template< typename Device,
		typename Index,
		typename IndexAllocator,
		bool RowMajorOrder,
		int WarpSize >
		template< typename SizesHolder >
		void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		computeColumnSizes( const SizesHolder& segmentsSizes )
		{
		IndexType numberOfStrips = this->virtualRows / getWarpSize();
		auto groupPointersView = this->groupPointers.getView();
		auto segmentsPermutationView = this->rowPermArray.getView();
		auto segmentsSizesView = segmentsSizes.getConstView();
		const IndexType size = this->getSize();
		Algorithms::ParallelFor< DeviceType >::exec(
		( IndexType ) 0,
		this->virtualRows / getWarpSize(),
		[=] __cuda_callable__ ( const IndexType strip ) mutable {

		IndexType firstSegment = strip * getWarpSize();
		IndexType groupBegin = strip * ( getLogWarpSize() + 1 );
		IndexType emptyGroups = 0;

		////
		// The last strip can be shorter
		if( strip == numberOfStrips - 1 )
		{
		IndexType segmentsCount = size - firstSegment;
		while( !( segmentsCount > TNL::pow( getLogWarpSize() - 1 - emptyGroups, 2 ) ) )
		emptyGroups++;
		for( IndexType group = groupBegin; group < groupBegin + emptyGroups; group++ )
		groupPointersView[ group ] = 0;
		}

		IndexType allocatedColumns = 0;
		for( IndexType groupIdx = emptyGroups; groupIdx < getLogWarpSize(); groupIdx++ )
		{
		IndexType segmentIdx = TNL::pow( getLogWarpSize() - 1 - groupIdx, 2 );
		IndexType permSegm = 0;
		while( segmentsPermutationView[ permSegm + firstSegment ] != segmentIdx + firstSegment )
		permSegm++;
		const IndexType groupWidth = segmentsSizesView[ permSegm + firstSegment ] - allocatedColumns;
		const IndexType groupHeight = TNL::pow( getLogWarpSize() - groupIdx, 2 );
		const IndexType groupSize = groupWidth * groupHeight;
		allocatedColumns = segmentsSizes[ permSegm + firstSegment ];
		groupPointersView[ groupIdx + groupBegin ] = groupSize;
		}
		} );
		}

		template< typename Device,
		typename Index,
		typename IndexAllocator,
		bool RowMajorOrder,
		int WarpSize >
		template< typename SizesHolder >
		void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		verifyRowPerm( const SizesHolder& segmentsSizes )
		{
		bool ok = true;
		IndexType numberOfStrips = this->virtualRows / getWarpSize();
		for( IndexType strip = 0; strip < numberOfStrips; strip++ )
		{
		IndexType begin = strip * getWarpSize();
		IndexType end = ( strip + 1 ) * getWarpSize();
		if( this->getSize() < end )
		end = this->getSize();
		for( IndexType i = begin; i < end - 1; i++ )
		{
		IndexType permIndex1, permIndex2;
		bool first = false;
		bool second = false;
		for( IndexType j = begin; j < end; j++ )
		{
		if( this->rowPermArray.getElement( j ) == i )
		{
		permIndex1 = j;
		first = true;
		}
		if( this->rowPermArray.getElement( j ) == i + 1 )
		{
		permIndex2 = j;
		second = true;
		}
		}
		if( !first \|\| !second )
		std::cout << "Wrong permutation!" << std::endl;
		if( segmentsSizes.getElement( permIndex1 ) >= segmentsSizes.getElement( permIndex2 ) )
		continue;
		else
		ok = false;
		}
		}
		if( !ok )
		throw( std::logic_error( "Segments permutaion verification failed." ) );
		}

		template< typename Device,
		typename Index,
		typename IndexAllocator,
		bool RowMajorOrder,
		int WarpSize >
		template< typename SizesHolder >
		void BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		verifyRowLengths( const SizesHolder& segmentsSizes )
		{
		bool ok = true;
		for( IndexType segmentIdx = 0; segmentIdx < this->getSize(); segmentIdx++ )
		{
		BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< Index >, RowMajorOrder > hostSegments;
		const IndexType strip = segmentIdx / getWarpSize();
		const IndexType stripLength = this->getStripLength( strip );
		const IndexType groupBegin = ( getLogWarpSize() + 1 ) * strip;
		const IndexType rowStripPerm = this->rowPermArray.getElement( segmentIdx ) - strip * getWarpSize();
		const IndexType begin = this->groupPointers.getElement( groupBegin ) * getWarpSize() + rowStripPerm * stripLength;
		IndexType elementPtr = begin;
		IndexType rowLength = 0;
		const IndexType groupsCount = details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getActiveGroupsCount( this->rowPermArray.getConstView(), segmentIdx );
		for( IndexType group = 0; group < groupsCount; group++ )
		{
		for( IndexType i = 0; i < this->getGroupLength( strip, group ); i++ )
		{
		IndexType biElementPtr = elementPtr;
		for( IndexType j = 0; j < this->power( 2, group ); j++ )
		{
		rowLength++;
		biElementPtr += this->power( 2, getLogWarpSize() - group ) * stripLength;
		}
		elementPtr++;
		}
		}
		if( segmentsSizes.getElement( segmentIdx ) > rowLength )
		ok = false;
		}
		if( ! ok )
		throw( std::logic_error( "Segments capacities verification failed." ) );
		}

		template< typename Device,
		typename Index,
		typename IndexAllocator,
		bool RowMajorOrder,
		int WarpSize >
		template< typename SizesHolder >
		void
		BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		setSegmentsSizes( const SizesHolder& segmentsSizes )
		{
		//if( std::is_same< DeviceType, Devices::Host >::value )
		// {
		this->size = segmentsSizes.getSize();
		if( this->size % WarpSize != 0 )
		this->virtualRows = this->size + getWarpSize() - ( this->size % getWarpSize() );
		else
		this->virtualRows = this->size;
		IndexType strips = this->virtualRows / getWarpSize();
		this->rowPermArray.setSize( this->size );
		this->groupPointers.setSize( strips * ( getLogWarpSize() + 1 ) + 1 );
		this->groupPointers = 0;

		this->performRowBubbleSort( segmentsSizes );
		this->computeColumnSizes( segmentsSizes );

		this->groupPointers.template scan< Algorithms::ScanType::Exclusive >();

		this->verifyRowPerm( segmentsSizes );
		this->verifyRowLengths( segmentsSizes );
		this->storageSize = getWarpSize() * this->groupPointers.getElement( strips * ( getLogWarpSize() + 1 ) );
		/*}
		else
		{
		BiEllpack< Devices::Host, Index, typename Allocators::Default< Devices::Host >::template Allocator< IndexType >, RowMajorOrder > hostSegments;
		Containers::Vector< IndexType, Devices::Host, IndexType > hostSegmentsSizes( segmentsSizes );
		hostSegments.setSegmentsSizes( hostSegmentsSizes );
		*this = hostSegments;
		}
		}*/
		}

		template< typename Device,
		@@ -182,7 +403,7 @@ template< typename Device,
		bool RowMajorOrder,
		int WarpSize >
		__cuda_callable__ auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		getGlobalIndex( const Index segmentIdx, const Index localIdx ) const -> IndexType
		getGlobalIndex( const IndexType segmentIdx, const IndexType localIdx ) const -> IndexType
		{
		return details::BiEllpack< IndexType, DeviceType, RowMajorOrder >::getGlobalIndex(
		rowPermArray.getConstView(),
		@@ -308,11 +529,22 @@ template< typename Device,
		typename IndexAllocator,
		bool RowMajorOrder,
		int WarpSize >
		void
		BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		printStructure( std::ostream& str )
		auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		getStripLength( const IndexType stripIdx ) const -> IndexType
		{
		return details::BiEllpack< Index, Device, RowMajorOrder, WarpSize >::getStripLength( this->groupPointers.getConstView(), stripIdx );
		}

		template< typename Device,
		typename Index,
		typename IndexAllocator,
		bool RowMajorOrder,
		int WarpSize >
		auto BiEllpack< Device, Index, IndexAllocator, RowMajorOrder, WarpSize >::
		getGroupLength( const IndexType strip, const IndexType group ) const -> IndexType
		{
		this->getView().printStructure( str );
		return this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group + 1 )
		- this->groupPointers.getElement( strip * ( getLogWarpSize() + 1 ) + group );
		}

		} // namespace Segments

src/TNL/Containers/Segments/BiEllpackSegmentView.h

+39 −51

Original line number	Diff line number	Diff line
		@@ -10,64 +10,42 @@

		#pragma once

		#include <TNL/Containers/StaticVector.h>

		namespace TNL {
		namespace Containers {
		namespace Segments {

		template< typename Index,
		bool RowMajorOrder = false >
		class BiEllpackSegmentView;

		template< typename Index >
		class BiEllpackSegmentView< Index, false >
		bool RowMajorOrder = false,
		int WarpSize = 32 >
		class BiEllpackSegmentView
		{
		public:

		using IndexType = Index;

		__cuda_callable__
		BiEllpackSegmentView( const IndexType offset,
		const IndexType size,
		const IndexType chunkSize, // this is only for compatibility with the following specialization
		const IndexType chunksInSlice ) // this one as well - both can be replaced when we could use constexprif in C++17
		: segmentOffset( offset ), segmentSize( size ){};

		__cuda_callable__
		BiEllpackSegmentView( const BiEllpackSegmentView& view )
		: segmentOffset( view.segmentOffset ), segmentSize( view.segmentSize ){};

		__cuda_callable__
		IndexType getSize() const
		{
		return this->segmentSize;
		};

		__cuda_callable__
		IndexType getGlobalIndex( const IndexType localIndex ) const
		{
		TNL_ASSERT_LT( localIndex, segmentSize, "Local index exceeds segment bounds." );
		return segmentOffset + localIndex;
		};
		static constexpr int getWarpSize() { return WarpSize; };

		protected:
		static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };

		IndexType segmentOffset, segmentSize;
		};

		template< typename Index >
		class BiEllpackSegmentView< Index, true >
		{
		public:
		static constexpr int getGroupsCount() { return getLogWarpSize() + 1; };

		using IndexType = Index;
		using GroupsWidthType = Containers::StaticVector< getGroupsCount(), IndexType >;


		/**
		* \brief Constructor.
		*
		* \param offset is offset of the first group of the strip the segment belongs to.
		* \param size is the segment size
		* \param inStripIdx is index of the segment within its strip.
		* \param groupsWidth is a static vector containing widths of the strip groups
		*/
		__cuda_callable__
		BiEllpackSegmentView( const IndexType offset,
		const IndexType size,
		const IndexType chunkSize,
		const IndexType chunksInSlice )
		: segmentOffset( offset ), segmentSize( size ),
		chunkSize( chunkSize ), chunksInSlice( chunksInSlice ){};
		const IndexType inStripIdx,
		const GroupsWidthType& groupsWidth )
		: groupOffset( offset ), segmentSize( TNL::sum( groupsWidth ) ), inStripIdx( inStripIdx ), groupsWidth( groupsWidth ){};

		__cuda_callable__
		IndexType getSize() const
		@@ -76,17 +54,27 @@ class BiEllpackSegmentView< Index, true >
		};

		__cuda_callable__
		IndexType getGlobalIndex( const IndexType localIdx ) const
		IndexType getGlobalIndex( IndexType localIdx ) const
		{
		IndexType i( 0 ), offset( groupOffset ), groupHeight( getWarpSize() );
		while( localIdx > groupsWidth[ i ] )
		{
		TNL_ASSERT_LT( localIdx, segmentSize, "Local index exceeds segment bounds." );
		const IndexType chunkIdx = localIdx / chunkSize;
		const IndexType inChunkOffset = localIdx % chunkSize;
		return segmentOffset + inChunkOffset * chunksInSlice + chunkIdx;
		localIdx -= groupsWidth[ i ];
		offset += groupsWidth[ i++ ] * groupHeight;
		groupHeight /= 2;
		}
		TNL_ASSERT_LE( i, TNL::log2( getWarpSize() - inStripIdx + 1 ), "Local index exceeds segment bounds." );
		if( RowMajorOrder )
		return offset + inStripIdx * groupsWidth[ i ] + localIdx;
		else
		return offset + inStripIdx + localIdx * groupHeight;
		};

		protected:

		IndexType segmentOffset, segmentSize, chunkSize, chunksInSlice;
		IndexType groupOffset, inStripIdx, segmentSize;

		GroupsWidthType groupsWidth;
		};

		} //namespace Segments

src/TNL/Containers/Segments/BiEllpackView.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -134,7 +134,7 @@ class BiEllpackView

		static constexpr int getWarpSize() { return WarpSize; };

		static constexpr int getLogWarpSize() { return std::log( WarpSize ); };
		static constexpr int getLogWarpSize() { return std::log2( WarpSize ); };

		IndexType size = 0, storageSize = 0;

src/TNL/Containers/Segments/BiEllpackView.hpp

+3 −2

Original line number	Diff line number	Diff line
		@@ -285,6 +285,7 @@ BiEllpackView< Device, Index, RowMajorOrder, WarpSize >::
		segmentsReduction( IndexType first, IndexType last, Fetch& fetch, Reduction& reduction, ResultKeeper& keeper, const Real& zero, Args... args ) const
		{
		using RealType = typename details::FetchLambdaAdapter< Index, Fetch >::ReturnType;

		}

		template< typename Device,
		@@ -310,8 +311,8 @@ operator=( const BiEllpackView& source )
		this->size = source.size;
		this->storageSize = source.storageSize;
		this->virtualRows = source.virtualRows;
		this->rowPermArray = source.rowPermArray;
		this->groupPointers = source.groupPointers;
		this->rowPermArray.bind( source.rowPermArray );
		this->groupPointers.bind( source.groupPointers );
		return *this;
		}