Added Adaptove CSR kernel dispatcher. (f389535b) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp

+145 −68

Original line number	Diff line number	Diff line
		@@ -176,85 +176,73 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		#endif

		template< typename Index,
		typename Device >
		CSRAdaptiveKernelView< Index, Device >::
		CSRAdaptiveKernelView( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		}

		template< typename Index,
		typename Device >
		void
		CSRAdaptiveKernelView< Index, Device >::
		setBlocks( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		}
		typename Device,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		int StreamedMemory,
		bool DispatchScalarCSR =
		details::CheckFetchLambda< Index, Fetch >::hasAllParameters() \|\|
		std::is_same< Device, Devices::Host >::value >
		struct CSRAdaptiveKernelSegmentsReductionDispatcher;

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernelView< Index, Device >::
		getView() -> ViewType
		typename Device,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		int StreamedMemory >
		struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, true >
		{
		return *this;
		};

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernelView< Index, Device >::
		getConstView() const -> ConstViewType
		{
		return *this;
		}

		template< typename Index,
		typename Device >
		TNL::String
		CSRAdaptiveKernelView< Index, Device >::
		getKernelType()
		template< typename BlocksView,
		typename Offsets,
		typename Real,
		typename... Args >
		static void reduce( const Offsets& offsets,
		const BlocksView& blocks,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args)
		{
		return "Adaptive";
		TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
		segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		}
		};

		template< typename Index,
		typename Device >
		template< typename OffsetsView,
		typename Device,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		int StreamedMemory >
		struct CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory, false >
		{
		template< typename BlocksView,
		typename Offsets,
		typename Real,
		typename... Args >
		void
		CSRAdaptiveKernelView< Index, Device >::
		segmentsReduction( const OffsetsView& offsets,
		static void reduce( const Offsets& offsets,
		const BlocksView& blocks,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args ) const
		Args... args)
		{
		#ifdef HAVE_CUDA
		if( details::CheckFetchLambda< Index, Fetch >::hasAllParameters() )
		{
		TNL::Algorithms::Segments::CSRScalarKernel< Index, Device >::
		segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		return;
		}

		//constexpr int warpSize = 32;

		Index blocksCount;
		const Index threads = details::CSRAdaptiveKernelParameters< Real, StreamedMemory >::CudaBlockSize();

		const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();

		/* Fill blocks */
		size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
		/* Execute kernels on device */
		// Fill blocks
		size_t neededThreads = blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
		// Execute kernels on device
		for( Index gridIdx = 0; neededThreads != 0; gridIdx++ )
		{
		if( Cuda::getMaxGridSize() * threads >= neededThreads )
		@@ -269,11 +257,12 @@ segmentsReduction( const OffsetsView& offsets,
		}

		segmentsReductionCSRAdaptiveKernel<
		StreamedMemory,
		BlocksView,
		OffsetsView,
		Offsets,
		Index, Fetch, Reduction, ResultKeeper, Real, Args... >
		<<<blocksCount, threads>>>(
		this->blocks,
		blocks,
		gridIdx,
		offsets,
		first,
		@@ -286,11 +275,99 @@ segmentsReduction( const OffsetsView& offsets,
		}
		#endif
		}
		};



		template< typename Index,
		typename Device >
		CSRAdaptiveKernelView< Index, Device >&
		CSRAdaptiveKernelView< Index, Device >::
		typename Device,
		int StreamedMemory >
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		CSRAdaptiveKernelView( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		}

		template< typename Index,
		typename Device,
		int StreamedMemory >
		void
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		setBlocks( BlocksType& blocks )
		{
		this->blocks.bind( blocks );
		}

		template< typename Index,
		typename Device,
		int StreamedMemory >
		auto
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		getBlocks() const -> const BlocksView&
		{
		return this->blocks;
		}

		template< typename Index,
		typename Device,
		int StreamedMemory >
		auto
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		getView() -> ViewType
		{
		return *this;
		};

		template< typename Index,
		typename Device,
		int StreamedMemory >
		auto
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		getConstView() const -> ConstViewType
		{
		return *this;
		}

		template< typename Index,
		typename Device,
		int StreamedMemory >
		TNL::String
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		getKernelType()
		{
		return "Adaptive";
		}

		template< typename Index,
		typename Device,
		int StreamedMemory >
		template< typename Offsets,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		void
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		segmentsReduction( const Offsets& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args ) const
		{

		CSRAdaptiveKernelSegmentsReductionDispatcher< Index, Device, Fetch, Reduction, ResultKeeper, StreamedMemory >::template
		reduce< BlocksView, Offsets, Real, Args... >( offsets, this->getBlocks(), first, last, fetch, reduction, keeper, zero, args... );
		}

		template< typename Index,
		typename Device,
		int StreamedMemory >
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >&
		CSRAdaptiveKernelView< Index, Device, StreamedMemory >::
		operator=( const CSRAdaptiveKernelView< Index, Device >& kernelView )
		{
		this->blocks.bind( kernelView.blocks );