Refactoring of adaptive CSR kernel. (936f2c32) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp

+11 −12

Original line number	Diff line number	Diff line
		@@ -53,16 +53,16 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();

		__shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
		//__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
		//__shared__ BlockType sharedBlocks[ WarpsCount ];
		__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];
		__shared__ BlockType sharedBlocks[ WarpsCount ];

		const Index index = ( ( gridIdx * TNL::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x ) + threadIdx.x;
		const Index blockIdx = index / WarpSize;
		if( blockIdx >= blocks.getSize() - 1 )
		return;

		//if( threadIdx.x < CudaBlockSize / WarpSize )
		// multivectorShared[ threadIdx.x ] = zero;
		if( threadIdx.x < CudaBlockSize / WarpSize )
		multivectorShared[ threadIdx.x ] = zero;
		Real result = zero;
		bool compute( true );
		const Index laneIdx = threadIdx.x & 31; // & is cheaper than %
		@@ -71,7 +71,8 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		__syncthreads();
		const auto& block = sharedBlocks[ warpIdx ];*/
		const BlockType block = blocks[ blockIdx ];
		const Index begin = offsets[ block.getFirstSegment() ];
		const Index firstSegmentIdx = block.getFirstSegment();
		const Index begin = offsets[ firstSegmentIdx ];

		if( block.getType() == details::Type::STREAM ) // Stream kernel - many short segments per warp
		{
		@@ -80,12 +81,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,

		// Stream data to shared memory
		for( Index globalIdx = laneIdx + begin; globalIdx < end; globalIdx += WarpSize )
		{
		streamShared[ warpIdx ][ globalIdx - begin ] = fetch( globalIdx, compute );
		}
		//const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();
		const Index lastSegmentIdx = firstSegmentIdx + block.getSegmentsInBlock();

		/*for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
		for( Index i = firstSegmentIdx + laneIdx; i < lastSegmentIdx; i += WarpSize )
		{
		const Index sharedEnd = offsets[ i + 1 ] - begin; // end of preprocessed data
		result = zero;
		@@ -93,9 +92,9 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		for( Index sharedIdx = offsets[ i ] - begin; sharedIdx < sharedEnd; sharedIdx++ )
		result = reduce( result, streamShared[ warpIdx ][ sharedIdx ] );
		keep( i, result );
		}*/
		}
		/*else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
		}
		else if( block.getType() == details::Type::VECTOR ) // Vector kernel - one segment per warp
		{
		const Index end = begin + block.getSize();
		const Index segmentIdx = block.getFirstSegment();
		@@ -172,7 +171,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		keep( segmentIdx, multivectorShared[ 0 ] );
		}
		}
		}*/
		}
		}
		#endif

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h

+13 −1

Original line number	Diff line number	Diff line
		@@ -22,11 +22,13 @@ enum class Type {
		VECTOR = 2
		};

		//#define CSR_ADAPTIVE_UNION

		#ifdef CSR_ADAPTIVE_UNION
		template< typename Index >
		union CSRAdaptiveKernelBlockDescriptor
		{
		CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0) noexcept
		CSRAdaptiveKernelBlockDescriptor(Index row, Type type = Type::VECTOR, Index index = 0, uint8_t warpsCount = 0) noexcept
		{
		this->index[0] = row;
		this->index[1] = index;
		@@ -80,6 +82,16 @@ union CSRAdaptiveKernelBlockDescriptor
		return ( twobytes[ sizeof( Index ) == 4 ? 3 : 5 ] & 0x3FFF );
		}

		__cuda_callable__ uint8_t getWarpIdx() const
		{
		return index[ 1 ];
		}

		__cuda_callable__ uint8_t getWarpsCount() const
		{
		return 1;
		}

		void print( std::ostream& str ) const
		{
		Type type = this->getType();

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h

+4 −1

Original line number	Diff line number	Diff line
		@@ -15,7 +15,7 @@ namespace TNL {
		namespace Segments {
		namespace details {

		static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };
		static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 256, 256, 256 };

		template< int SizeOfValue = 1,
		int StreamedSharedMemory_ = 24576 >
		@@ -25,7 +25,10 @@ struct CSRAdaptiveKernelParameters

		static constexpr int getSizeValueLogConstexpr( const int i );

		static constexpr int getSizeOfValue() { return SizeOfValue; };

		static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );

		static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );

		/**