Added blocks in CSR adaptive kernel for different Value/Real type sizes. (440c35b9) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h

+20 −11

Original line number	Diff line number	Diff line
		@@ -63,13 +63,15 @@ struct CSRAdaptiveKernel
		using BlocksType = typename ViewType::BlocksType;
		using BlocksView = typename BlocksType::ViewType;

		static constexpr int MaxValueSizeLog() { return ViewType::MaxValueSizeLog; };

		static TNL::String getKernelType();


		static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
		static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

		// How many shared memory use per block in CSR Adaptive kernel
		static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:
		static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::StreamedSharedMemory(); //20000; //24576; TODO:

		// Number of elements in shared memory
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);
		@@ -84,14 +86,7 @@ struct CSRAdaptiveKernel
		static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		// Max length of row to process one warp for CSR Adaptive
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();

		template< typename Offsets >
		Index findLimit( const Index start,
		const Offsets& offsets,
		const Index size,
		details::Type &type,
		Index &sum );
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Index ) >::MaxAdaptiveElementsPerWarp();

		template< typename Offsets >
		void init( const Offsets& offsets );
		@@ -118,7 +113,21 @@ struct CSRAdaptiveKernel
		Args... args ) const;

		protected:
		BlocksType blocks;
		template< int SizeOfValue, typename Offsets >
		Index findLimit( const Index start,
		const Offsets& offsets,
		const Index size,
		details::Type &type,
		Index &sum );

		template< int SizeOfValue,
		typename Offsets >
		void initValueSize( const Offsets& offsets );

		/**
		* \brief blocksArray[ i ] stores blocks for sizeof( Value ) == 2^i.
		*/
		BlocksType blocksArray[ MaxValueSizeLog() ];

		ViewType view;
		};

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp

+83 −59

Original line number	Diff line number	Diff line
		@@ -31,10 +31,81 @@ getKernelType()
		return ViewType::getKernelType();
		};


		template< typename Index,
		typename Device >
		template< typename Offsets >
		void
		CSRAdaptiveKernel< Index, Device >::
		init( const Offsets& offsets )
		{
		this->template initValueSize< 1 >( offsets );
		this->template initValueSize< 2 >( offsets );
		this->template initValueSize< 4 >( offsets );
		this->template initValueSize< 8 >( offsets );
		this->template initValueSize< 16 >( offsets );
		this->template initValueSize< 32 >( offsets );
		for( int i = 0; i < MaxValueSizeLog(); i++ )
		this->view.setBlocks( blocksArray[ i ], i );
		}


		template< typename Index,
		typename Device >
		void
		CSRAdaptiveKernel< Index, Device >::
		reset()
		{
		for( int i = 0; i < MaxValueSizeLog(); i++ )
		{
		this->blocksArray[ i ].reset();
		this->view.setBlocks( this->blocksArray[ i ], i );
		}
		}

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernel< Index, Device >::
		getView() -> ViewType
		{
		return this->view;
		}

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernel< Index, Device >::
		getConstView() const -> ConstViewType
		{
		return this->view;
		};

		template< typename Index,
		typename Device >
		template< typename OffsetsView,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		void
		CSRAdaptiveKernel< Index, Device >::
		segmentsReduction( const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args ) const
		{
		view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		}

		template< typename Index,
		typename Device >
		template< int SizeOfValue,
		typename Offsets >
		Index
		CSRAdaptiveKernel< Index, Device >::
		findLimit( const Index start,
		@@ -48,7 +119,7 @@ findLimit( const Index start,
		{
		Index elements = offsets[ current + 1 ] - offsets[ current ];
		sum += elements;
		if( sum > SHARED_PER_WARP )
		if( sum > details::CSRAdaptiveKernelParameters< SizeOfValue >::StreamedSharedElementsPerWarp() )
		{
		if( current - start > 0 ) // extra row
		{
		@@ -57,7 +128,7 @@ findLimit( const Index start,
		}
		else
		{ // one long row
		if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
		if( sum <= 2 * details::CSRAdaptiveKernelParameters< SizeOfValue >::MaxAdaptiveElementsPerWarp() ) //MAX_ELEMENTS_PER_WARP_ADAPT )
		type = details::Type::VECTOR;
		else
		type = details::Type::LONG;
		@@ -71,10 +142,11 @@ findLimit( const Index start,

		template< typename Index,
		typename Device >
		template< typename Offsets >
		template< int SizeOfValue,
		typename Offsets >
		void
		CSRAdaptiveKernel< Index, Device >::
		init( const Offsets& offsets )
		initValueSize( const Offsets& offsets )
		{
		using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
		HostOffsetsType hostOffsets( offsets );
		@@ -88,7 +160,7 @@ init( const Offsets& offsets )
		while( nextStart != rows - 1 )
		{
		details::Type type;
		nextStart = findLimit( start, hostOffsets, rows, type, sum );
		nextStart = findLimit< SizeOfValue >( start, hostOffsets, rows, type, sum );

		if( type == details::Type::LONG )
		{
		@@ -110,58 +182,10 @@ init( const Offsets& offsets )
		start = nextStart;
		}
		inBlocks.emplace_back(nextStart);
		this->blocks = inBlocks;
		this->view.setBlocks( blocks );
		}

		template< typename Index,
		typename Device >
		void
		CSRAdaptiveKernel< Index, Device >::
		reset()
		{
		this->blocks.reset();
		this->view.setBlocks( blocks );
		}

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernel< Index, Device >::
		getView() -> ViewType
		{
		return this->view;
		}

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernel< Index, Device >::
		getConstView() const -> ConstViewType
		{
		return this->view;
		};

		template< typename Index,
		typename Device >
		template< typename OffsetsView,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		void
		CSRAdaptiveKernel< Index, Device >::
		segmentsReduction( const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args ) const
		{
		view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		//std::cerr << "Setting blocks to " << std::log2( SizeOfValue ) << std::endl;
		TNL_ASSERT_LT( std::log2( SizeOfValue ), MaxValueSizeLog(), "" );
		TNL_ASSERT_GE( std::log2( SizeOfValue ), 0, "" );
		this->blocksArray[ (int ) std::log2( SizeOfValue ) ] = inBlocks;
		}

		} // namespace Segments

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h

+6 −4

Original line number	Diff line number	Diff line
		@@ -28,11 +28,13 @@ struct CSRAdaptiveKernelView
		using BlocksType = TNL::Containers::Vector< details::CSRAdaptiveKernelBlockDescriptor< Index >, Device, Index >;
		using BlocksView = typename BlocksType::ViewType;

		static constexpr int MaxValueSizeLog = 6;

		CSRAdaptiveKernelView() = default;

		CSRAdaptiveKernelView( BlocksType& blocks );

		void setBlocks( BlocksType& blocks );
		void setBlocks( BlocksType& blocks, const int idx );

		ViewType getView();

		@@ -57,10 +59,10 @@ struct CSRAdaptiveKernelView

		CSRAdaptiveKernelView& operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView );

		void printBlocks() const;
		void printBlocks( int idx ) const;

		protected:
		BlocksView blocks;
		BlocksView blocksArray[ MaxValueSizeLog ];
		};

		} // namespace Segments

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp

+21 −17

Original line number	Diff line number	Diff line
		@@ -50,10 +50,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		Real zero,
		Args... args )
		{
		static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
		constexpr int WarpSize = Cuda::getWarpSize();
		constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
		constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();
		static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize();
		//constexpr int WarpSize = Cuda::getWarpSize();
		//constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::WarpsCount();
		//constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedElementsPerWarp();


		__shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
		@@ -199,21 +199,21 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		}
		#endif

		template< typename Index,
		/*template< typename Index,
		typename Device >
		CSRAdaptiveKernelView< Index, Device >::
		CSRAdaptiveKernelView( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		}
		}*/

		template< typename Index,
		typename Device >
		void
		CSRAdaptiveKernelView< Index, Device >::
		setBlocks( BlocksType& blocks )
		setBlocks( BlocksType& blocks, const int idx )
		{
		this->blocks.bind( blocks );
		this->blocksArray[ idx ].bind( blocks );
		}

		template< typename Index,
		@@ -263,23 +263,25 @@ segmentsReduction( const OffsetsView& offsets,
		Args... args ) const
		{
		#ifdef HAVE_CUDA
		if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
		int valueSizeLog = std::ceil( log2f( ( double ) sizeof( Real ) ) );

		if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() \|\| valueSizeLog > MaxValueSizeLog )
		{
		TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
		segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		return;
		}

		static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;
		static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

		/* Max length of row to process one warp for CSR Light, MultiVector */
		//static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		/* Max length of row to process one warp for CSR Adaptive */
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();
		//static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp();

		/* How many shared memory use per block in CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();
		static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< sizeof( Real ) >::StreamedSharedMemory();

		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
		@@ -298,7 +300,7 @@ segmentsReduction( const OffsetsView& offsets,
		constexpr size_t MAX_X_DIM = 2147483647;

		/* Fill blocks */
		size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block
		size_t neededThreads = this->blocksArray[ valueSizeLog ].getSize() * warpSize; // one warp per block
		/* Execute kernels on device */
		for (Index gridIdx = 0; neededThreads != 0; gridIdx++ )
		{
		@@ -317,12 +319,12 @@ segmentsReduction( const OffsetsView& offsets,
		warpSize,
		WARPS,
		SHARED_PER_WARP,
		details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
		details::CSRAdaptiveKernelParameters< sizeof( Real ) >::MaxAdaptiveElementsPerWarp(),
		BlocksView,
		OffsetsView,
		Index, Fetch, Reduction, ResultKeeper, Real, Args... >
		<<<blocksCount, threads>>>(
		this->blocks,
		this->blocksArray[ valueSizeLog ],
		gridIdx,
		offsets,
		first,
		@@ -342,7 +344,8 @@ CSRAdaptiveKernelView< Index, Device >&
		CSRAdaptiveKernelView< Index, Device >::
		operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
		{
		this->blocks.bind( kernelView.blocks );
		for( int i = 0; i < MaxValueSizeLog; i++ )
		this->blocksArray[ i ].bind( kernelView.blocksArray[ i ] );
		return *this;
		}

		@@ -350,8 +353,9 @@ template< typename Index,
		typename Device >
		void
		CSRAdaptiveKernelView< Index, Device >::
		printBlocks() const
		printBlocks( int idx ) const
		{
		auto& blocks = this->blocksArray[ idx ];
		for( Index i = 0; i < this->blocks.getSize(); i++ )
		{
		auto block = blocks.getElement( i );

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -15,7 +15,7 @@ namespace TNL {
		namespace Segments {
		namespace details {

		template< typename Value,
		template< int SizeOfValue,
		int StreamedSharedMemory_ = 24576 >
		struct CSRAdaptiveKernelParameters
		{
		@@ -37,7 +37,7 @@ struct CSRAdaptiveKernelParameters
		/**
		* \brief Number of elements fitting into streamed shared memory.
		*/
		static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };
		static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / SizeOfValue; };

		/**
		* \brief Computes number of warps in one CUDA block.