And now the real optimization of the Adaptive CSR kernel initialization :). (fc12a48d) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRKernelAdaptive.h

+45 −50

Original line number	Diff line number	Diff line
		@@ -119,7 +119,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		const Index segmentIdx = block.getFirstSegment();//block.index[0];
		//minID = offsets[block.index[0] ];
		const Index end = offsets[segmentIdx + 1];
		const int tid = threadIdx.x;
		//const int tid = threadIdx.x;
		//const int inBlockWarpIdx = block.getWarpIdx();

		//if( to > end )
		@@ -382,11 +382,9 @@ struct CSRKernelAdaptive
		Index &sum )
		{
		sum = 0;
		TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >
		hostOffsets( offsets );
		for (Index current = start; current < size - 1; current++ )
		{
		Index elements = hostOffsets[ current + 1 ] - hostOffsets[ current ];
		Index elements = offsets[ current + 1 ] - offsets[ current ];
		sum += elements;
		if( sum > SHARED_PER_WARP )
		{
		@@ -401,7 +399,6 @@ struct CSRKernelAdaptive
		type = details::Type::VECTOR;
		else
		type = details::Type::LONG;
		//type = Type::LONG; //
		return current + 1;
		}
		}
		@@ -413,6 +410,8 @@ struct CSRKernelAdaptive
		template< typename Offsets >
		void init( const Offsets& offsets )
		{
		using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
		HostOffsetsType hostOffsets( offsets );
		const Index rows = offsets.getSize();
		Index sum, start( 0 ), nextStart( 0 );

		@@ -423,7 +422,7 @@ struct CSRKernelAdaptive
		while( nextStart != rows - 1 )
		{
		details::Type type;
		nextStart = findLimit( start, offsets, rows, type, sum );
		nextStart = findLimit( start, hostOffsets, rows, type, sum );

		if( type == details::Type::LONG )
		{
		@@ -432,13 +431,9 @@ struct CSRKernelAdaptive
		Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
		if( warpsLeft == 0 )
		warpsLeft = warpsPerCudaBlock;
		//Index parts = roundUpDivision(sum, this->SHARED_PER_WARP);
		inBlocks.emplace_back( start, details::Type::LONG, 0, warpsLeft );
		for( Index index = 1; index < warpsLeft; index++ )
		{
		for( Index index = 0; index < warpsLeft; index++ )
		inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
		}
		}
		else
		{
		inBlocks.emplace_back(start, type,
		@@ -451,7 +446,7 @@ struct CSRKernelAdaptive
		inBlocks.emplace_back(nextStart);
		this->blocks = inBlocks;
		this->view.setBlocks( blocks );
		};
		}

		void reset()
		{