Adaptive CSR kernels adapts to different Value/Real types. (0785c1da) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -65,6 +65,8 @@ struct CSRAdaptiveKernel

		static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };

		static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };

		static TNL::String getKernelType();

		template< typename Offsets >

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp

+2 −5

Original line number	Diff line number	Diff line
		@@ -165,7 +165,7 @@ initValueSize( const Offsets& offsets )
		if( type == details::Type::LONG )
		{
		const Index blocksCount = inBlocks.size();
		const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize() / TNL::Cuda::getWarpSize();
		const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< SizeOfValue >::CudaBlockSize() / TNL::Cuda::getWarpSize();
		Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
		if( warpsLeft == 0 )
		warpsLeft = warpsPerCudaBlock;
		@@ -182,10 +182,7 @@ initValueSize( const Offsets& offsets )
		start = nextStart;
		}
		inBlocks.emplace_back(nextStart);
		//std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl;
		TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" );
		TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" );
		this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks;
		this->blocksArray[ getSizeValueLog( SizeOfValue ) ] = inBlocks;
		}

		} // namespace Segments

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h

+4 −1

Original line number	Diff line number	Diff line
		@@ -12,6 +12,7 @@

		#include <TNL/Containers/Vector.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>

		namespace TNL {
		namespace Algorithms {
		@@ -28,7 +29,9 @@ struct CSRAdaptiveKernelView
		using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
		using BlocksView = typename BlocksType::ViewType;

		static constexpr int MaxValueSizeLog = 6;
		static constexpr int MaxValueSizeLog = details::CSRAdaptiveKernelParameters<>::MaxValueSizeLog;

		static int getSizeValueLog( const int& i ) { return details::CSRAdaptiveKernelParameters<>::getSizeValueLog( i ); };

		CSRAdaptiveKernelView() = default;

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp

+5 −23

Original line number	Diff line number	Diff line
		@@ -113,32 +113,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		}
		else // blockType == Type::LONG - several warps per segment
		{
		// Number of elements processed by previous warps
		//const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
		/// block.getWarpIdx() * MAX_ELEM_PER_WARP;
		//Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
		const Index segmentIdx = block.getFirstSegment();//block.index[0];
		//minID = offsets[block.index[0] ];
		const Index end = offsets[segmentIdx + 1];
		//const int tid = threadIdx.x;
		//const int inBlockWarpIdx = block.getWarpIdx();

		//if( to > end )
		// to = end;
		TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
		result = zero;
		//printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
		for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
		globalIdx < end;
		globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
		{
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
		//if( laneIdx == 0 )
		// printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
		// details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
		//result += values[i] * inVector[columnIndexes[i]];
		}
		//printf( "tid %d -> %d \n", tid, result );

		result += __shfl_down_sync(0xFFFFFFFF, result, 16);
		result += __shfl_down_sync(0xFFFFFFFF, result, 8);
		@@ -146,9 +131,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		result += __shfl_down_sync(0xFFFFFFFF, result, 2);
		result += __shfl_down_sync(0xFFFFFFFF, result, 1);

		//if( laneIdx == 0 )
		// printf( "WARP RESULT: tid %d -> %d \n", tid, result );

		const Index warpID = threadIdx.x / 32;
		if( laneIdx == 0 )
		multivectorShared[ warpID ] = result;
		@@ -249,9 +231,9 @@ segmentsReduction( const OffsetsView& offsets,
		Args... args ) const
		{
		#ifdef HAVE_CUDA
		int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) );
		int valueSizeLog = getSizeValueLog( sizeof( Real ) );

		if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() \|\| valueSizeLog > MaxValueSizeLog )
		if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() \|\| valueSizeLog >= MaxValueSizeLog )
		{
		TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
		segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		@@ -261,7 +243,7 @@ segmentsReduction( const OffsetsView& offsets,
		Index blocksCount;

		const Index threads = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
		constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize(); //2147483647;
		constexpr size_t maxGridSize = TNL::Cuda::getMaxGridSize();

		// Fill blocks
		size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * TNL::Cuda::getWarpSize(); // one warp per block

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h

+38 −3

Original line number	Diff line number	Diff line
		@@ -15,17 +15,26 @@ namespace TNL {
		namespace Segments {
		namespace details {

		template< int SizeOfValue,
		static constexpr int CSRAdaptiveKernelParametersCudaBlockSizes[] = { 256, 256, 256, 128, 128, 128 };

		template< int SizeOfValue = 1,
		int StreamedSharedMemory_ = 24576 >
		struct CSRAdaptiveKernelParameters
		{
		static constexpr int MaxValueSizeLog = 6;

		static constexpr int getSizeValueLogConstexpr( const int i );

		static constexpr int SizeOfValueLog = getSizeValueLogConstexpr( SizeOfValue );
		static_assert( SizeOfValueLog < MaxValueSizeLog, "Parameter SizeOfValue is too large." );

		/**
		* \brief Computes number of CUDA threads per block depending on Value type.
		*
		* \return CUDA block size.
		*/
		static constexpr int CudaBlockSize() { return 128; }; //sizeof( Value ) == 8 ? 128 : 256; };
		//std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };
		static constexpr int CudaBlockSize() { return CSRAdaptiveKernelParametersCudaBlockSizes[ SizeOfValueLog ]; };
		//{ return SizeOfValue == 8 ? 128 : 256; };

		/**
		* \brief Returns amount of shared memory dedicated for stream CSR kernel.
		@@ -64,6 +73,32 @@ struct CSRAdaptiveKernelParameters
		* \return Maximum number of elements per warp for adaptive kernel.
		*/
		static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };

		static int getSizeValueLog( const int i )
		{
		if( i == 1 ) return 0;
		if( i == 2 ) return 1;
		if( i <= 4 ) return 2;
		if( i <= 8 ) return 3;
		if( i <= 16 ) return 4;
		return 5;
		}
		};


		template< int SizeOfValue,
		int StreamedSharedMemory_ >
		constexpr int
		CSRAdaptiveKernelParameters< SizeOfValue, StreamedSharedMemory_ >::
		getSizeValueLogConstexpr( const int i )
		{
		if( i == 1 ) return 0;
		if( i == 2 ) return 1;
		if( i <= 4 ) return 2;
		if( i <= 8 ) return 3;
		if( i <= 16 ) return 4;
		if( i <= 32 ) return 5;
		return 6;
		};

		} // namespace details