Refactoring CSR Adaptive kernel. (1da0935b) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h

+9 −8

Original line number	Diff line number	Diff line
		@@ -65,25 +65,26 @@ struct CSRAdaptiveKernel

		static TNL::String getKernelType();


		static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

		/* Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:
		// Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel
		static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Index >::StreamedSharedMemory(); //20000; //24576; TODO:

		/* Number of elements in shared memory */
		// Number of elements in shared memory
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);

		/* Number of warps in block for CSR Adaptive */
		// Number of warps in block for CSR Adaptive
		static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

		/* Number of elements in shared memory per one warp */
		// Number of elements in shared memory per one warp
		static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

		/* Max length of a row processed by one warp for CSR Light, MultiVector */
		// Max length of a row processed by one warp for CSR Light, MultiVector
		static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		/* Max length of a row processed by one warp for CSR Adaptive */
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
		// Max length of a row processed by one warp for CSR Adaptive
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Index >::MaxAdaptiveElementsPerWarp();

		template< typename Offsets >
		Index findLimit( const Index start,

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp

+3 −3

Original line number	Diff line number	Diff line
		@@ -276,10 +276,10 @@ segmentsReduction( const OffsetsView& offsets,
		//static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		/* Max length of a row processed by one warp for CSR Adaptive */
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp();

		/* Amount of shared memory (in bytes) used per block in the CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 24576;
		static constexpr Index SHARED_PER_BLOCK = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedMemory();

		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);
		@@ -317,7 +317,7 @@ segmentsReduction( const OffsetsView& offsets,
		warpSize,
		WARPS,
		SHARED_PER_WARP,
		MAX_ELEMENTS_PER_WARP_ADAPT,
		details::CSRAdaptiveKernelParameters< Real >::MaxAdaptiveElementsPerWarp(),
		BlocksView,
		OffsetsView,
		Index, Fetch, Reduction, ResultKeeper, Real, Args... >

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h

+2 −2

Original line number	Diff line number	Diff line
		@@ -15,10 +15,10 @@ namespace TNL {
		namespace Segments {
		namespace details {

		template< typename Value >
		template< typename Value,
		int StreamedSharedMemory_ = 24576 >
		struct CSRAdaptiveKernelParameters
		{
		static const int StreamedSharedMemory_ = 20000;
		/**
		* \brief Computes number of CUDA threads per block depending on Value type.
		*