Added new CSR adaptive kernel block descriptor. (cadfb88a) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRKernelAdaptive.h

+10 −9

Original line number	Diff line number	Diff line
		@@ -113,11 +113,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		else // blockType == Type::LONG - several warps per segment
		{
		// Number of elements processed by previous warps
		const Index offset = block.index[1] * MAX_ELEM_PER_WARP;
		Index to = begin + (block.index[1] + 1) * MAX_ELEM_PER_WARP;
		const Index segmentIdx = block.index[0];
		const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
		block.getWarpIdx() * MAX_ELEM_PER_WARP;
		Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
		const Index segmentIdx = block.getFirstSegment();//block.index[0];
		//minID = offsets[block.index[0] ];
		const Index end = offsets[block.index[0] + 1];
		const Index end = offsets[segmentIdx + 1];
		const int tid = threadIdx.x;

		if( to > end )
		@@ -215,7 +216,7 @@ struct CSRKernelAdaptiveView
		return;
		}

		//this->printBlocks();
		this->printBlocks();
		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
		//static constexpr Index THREADS_SCALAR = 128;
		//static constexpr Index THREADS_VECTOR = 128;
		@@ -390,15 +391,15 @@ struct CSRKernelAdaptive

		if( type == details::Type::LONG )
		{
		inBlock.emplace_back( start, details::Type::LONG, 0 );
		const Index blocksCount = inBlock.size();
		const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
		const Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
		//Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
		/*for( Index index = 1; index < warpsLeft; index++ )
		inBlock.emplace_back( start, details::Type::LONG, 0, warpsLeft );
		for( Index index = 1; index < warpsLeft; index++ )
		{
		inBlock.emplace_back(start, Type::LONG, index);
		}*/
		inBlock.emplace_back( start, details::Type::LONG, index, warpsLeft );
		}
		}
		else
		{

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h

+51 −21

Original line number	Diff line number	Diff line
		@@ -108,18 +108,35 @@ union CSRAdaptiveKernelBlockDescriptor
		#else

		template< typename Index >
		union CSRAdaptiveKernelBlockDescriptor
		{
		CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
		{
		this->index[0] = row;
		struct CSRAdaptiveKernelBlockDescriptor
		{
		CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
		Type type = Type::VECTOR,
		uint8_t warpIdx = 0,
		uint8_t warpsCount = 0 ) noexcept
		{
		this->firstSegmentIdx = firstSegmentIdx;
		this->type = ( uint8_t ) type;
		this->warpIdx = warpIdx;
		this->warpsCount = warpsCount;
		/*this->index[0] = row;
		this->index[1] = index;
		this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;
		this->byte[sizeof(Index) == 4 ? 7 : 15] = (uint8_t)type;*/
		}

		CSRAdaptiveKernelBlockDescriptor(Index row, Type type, Index nextRow, Index maxID, Index minID) noexcept
		CSRAdaptiveKernelBlockDescriptor( Index firstSegmentIdx,
		Type type,
		Index lastSegmentIdx,
		Index end,
		Index begin ) noexcept
		{
		this->index[0] = row;
		this->firstSegmentIdx = firstSegmentIdx;
		this->warpIdx = 0;
		this->blockSize = end - begin;
		this->segmentsInBlock = lastSegmentIdx - firstSegmentIdx;
		this->type = ( uint8_t ) type;

		/*this->index[0] = row;
		this->index[1] = 0;
		this->twobytes[sizeof(Index) == 4 ? 2 : 4] = maxID - minID;

		@@ -129,23 +146,25 @@ union CSRAdaptiveKernelBlockDescriptor
		if (type == Type::STREAM)
		this->byte[sizeof(Index) == 4 ? 7 : 15] \|= 0b1000000;
		else if (type == Type::VECTOR)
		this->byte[sizeof(Index) == 4 ? 7 : 15] \|= 0b10000000;
		this->byte[sizeof(Index) == 4 ? 7 : 15] \|= 0b10000000;*/
		}

		CSRAdaptiveKernelBlockDescriptor() = default;

		__cuda_callable__ Type getType() const
		{
		if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
		return ( Type ) this->type;
		/*if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b1000000 )
		return Type::STREAM;
		if( byte[ sizeof( Index ) == 4 ? 7 : 15 ] & 0b10000000 )
		return Type::VECTOR;
		return Type::LONG;
		return Type::LONG;*/
		}

		__cuda_callable__ const Index& getFirstSegment() const
		{
		return index[ 0 ];
		return this->firstSegmentIdx;
		//return index[ 0 ];
		}

		/***
		@@ -153,7 +172,8 @@ union CSRAdaptiveKernelBlockDescriptor
		*/
		__cuda_callable__ const Index getSize() const
		{
		return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
		return this->blockSize;
		//return twobytes[ sizeof(Index) == 4 ? 2 : 4 ];
		}

		/***
		@@ -161,14 +181,19 @@ union CSRAdaptiveKernelBlockDescriptor
		*/
		__cuda_callable__ const Index getSegmentsInBlock() const
		{
		return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
		return this->segmentsInBlock;
		//return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
		}

		__cuda_callable__ const uint8_t getWarpIdx() const
		{
		return this->warpIdx;
		}

		void print( std::ostream& str ) const
		{
		Type type = this->getType();
		str << "Type: ";
		switch( type )
		switch( this->getType() )
		{
		case Type::STREAM:
		str << " Stream ";
		@@ -180,13 +205,18 @@ union CSRAdaptiveKernelBlockDescriptor
		str << " Long ";
		break;
		}
		str << " first segment: " << getFirstSegment();
		str << " block end: " << getSize();
		str << " index in warp: " << index[ 1 ];
		str << " first segment: " << this->getFirstSegment();
		str << " block end: " << this->getSize();
		str << " index in warp: " << this->getWarpIdx();
		}
		Index index[2]; // index[0] is row pointer, index[1] is index in warp
		uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
		uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID

		uint8_t type;
		Index firstSegmentIdx, blockSize, segmentsInBlock;
		uint8_t warpIdx, warpsCount;

		//Index index[2]; // index[0] is row pointer, index[1] is index in warp
		//uint8_t byte[sizeof(Index) == 4 ? 8 : 16]; // byte[7/15] is type specificator
		//uint16_t twobytes[sizeof(Index) == 4 ? 4 : 8]; //twobytes[2/4] is maxID - minID
		//twobytes[3/5] is nextRow - row
		};