Refuciktoring of adaptive CSR kernel. (afc75e25) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h

+11 −133

Original line number	Diff line number	Diff line
		@@ -23,35 +23,6 @@ namespace TNL {
		namespace Algorithms {
		namespace Segments {

		#ifdef HAVE_CUDA

		template< int CudaBlockSize,
		int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP,
		typename BlocksView,
		typename Offsets,
		typename Index,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		__global__ void
		segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		int gridIdx,
		Offsets offsets,
		Index first,
		Index last,
		Fetch fetch,
		Reduction reduce,
		ResultKeeper keep,
		Real zero,
		Args... args );
		#endif


		template< typename Index,
		typename Device >
		struct CSRAdaptiveKernel
		@@ -63,114 +34,24 @@ struct CSRAdaptiveKernel
		using BlocksType = typename ViewType::BlocksType;
		using BlocksView = typename BlocksType::ViewType;

		static TNL::String getKernelType()
		{
		return ViewType::getKernelType();
		};

		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;

		/* How many shared memory use per block in CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:

		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(double);

		/* Number of warps in block for CSR Adaptive */
		static constexpr Index WARPS = THREADS_ADAPTIVE / 32;
		static TNL::String getKernelType();

		/* Number of elements in shared memory per one warp */
		static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

		/* Max length of row to process one warp for CSR Light, MultiVector */
		static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		/* Max length of row to process one warp for CSR Adaptive */
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;

		template< typename Offsets >
		Index findLimit( const Index start,
		const Offsets& offsets,
		const Index size,
		details::Type &type,
		Index &sum )
		{
		sum = 0;
		for (Index current = start; current < size - 1; current++ )
		{
		Index elements = offsets[ current + 1 ] - offsets[ current ];
		sum += elements;
		if( sum > SHARED_PER_WARP )
		{
		if( current - start > 0 ) // extra row
		{
		type = details::Type::STREAM;
		return current;
		}
		else
		{ // one long row
		if( sum <= 2 * MAX_ELEMENTS_PER_WARP_ADAPT )
		type = details::Type::VECTOR;
		else
		type = details::Type::LONG;
		return current + 1;
		}
		}
		}
		type = details::Type::STREAM;
		return size - 1; // return last row pointer
		}
		Index &sum );

		template< typename Offsets >
		void init( const Offsets& offsets )
		{
		using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
		HostOffsetsType hostOffsets( offsets );
		const Index rows = offsets.getSize();
		Index sum, start( 0 ), nextStart( 0 );
		void init( const Offsets& offsets );

		// Fill blocks
		std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
		inBlocks.reserve( rows );
		void reset();

		while( nextStart != rows - 1 )
		{
		details::Type type;
		nextStart = findLimit( start, hostOffsets, rows, type, sum );
		ViewType getView();

		if( type == details::Type::LONG )
		{
		const Index blocksCount = inBlocks.size();
		const Index warpsPerCudaBlock = THREADS_ADAPTIVE / TNL::Cuda::getWarpSize();
		Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
		if( warpsLeft == 0 )
		warpsLeft = warpsPerCudaBlock;
		for( Index index = 0; index < warpsLeft; index++ )
		inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
		}
		else
		{
		inBlocks.emplace_back(start, type,
		nextStart,
		offsets.getElement(nextStart),
		offsets.getElement(start) );
		}
		start = nextStart;
		}
		inBlocks.emplace_back(nextStart);
		this->blocks = inBlocks;
		this->view.setBlocks( blocks );
		}

		void reset()
		{
		this->blocks.reset();
		this->view.setBlocks( blocks );
		}

		ViewType getView() { return this->view; };

		ConstViewType getConstView() const { return this->view; };
		ConstViewType getConstView() const;

		template< typename OffsetsView,
		typename Fetch,
		@@ -185,10 +66,7 @@ struct CSRAdaptiveKernel
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args ) const
		{
		view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		}
		Args... args ) const;

		protected:
		BlocksType blocks;

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.hpp

+142 −1

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@
		#include <TNL/Containers/VectorView.h>
		#include <TNL/Algorithms/ParallelFor.h>
		#include <TNL/Algorithms/Segments/details/LambdaAdapter.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>
		#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>

		@@ -22,6 +23,146 @@ namespace TNL {
		namespace Algorithms {
		namespace Segments {

		template< typename Index,
		typename Device >
		TNL::String
		CSRAdaptiveKernel< Index, Device >::
		getKernelType()
		{
		return ViewType::getKernelType();
		}

		template< typename Index,
		typename Device >
		template< typename Offsets >
		Index
		CSRAdaptiveKernel< Index, Device >::
		findLimit( const Index start,
		const Offsets& offsets,
		const Index size,
		details::Type &type,
		Index &sum )
		{
		sum = 0;
		for (Index current = start; current < size - 1; current++ )
		{
		Index elements = offsets[ current + 1 ] - offsets[ current ];
		sum += elements;
		if( sum > details::CSRAdaptiveKernelParameters< double >::StreamedSharedElementsPerWarp() )
		{
		if( current - start > 0 ) // extra row
		{
		type = details::Type::STREAM;
		return current;
		}
		else
		{ // one long row
		if( sum <= 2 * details::CSRAdaptiveKernelParameters< double >::MaxAdaptiveElementsPerWarp() )
		type = details::Type::VECTOR;
		else
		type = details::Type::LONG;
		return current + 1;
		}
		}
		}
		type = details::Type::STREAM;
		return size - 1; // return last row pointer
		}

		template< typename Index,
		typename Device >
		template< typename Offsets >
		void
		CSRAdaptiveKernel< Index, Device >::
		init( const Offsets& offsets )
		{
		using HostOffsetsType = TNL::Containers::Vector< typename Offsets::IndexType, TNL::Devices::Host, typename Offsets::IndexType >;
		HostOffsetsType hostOffsets( offsets );
		const Index rows = offsets.getSize();
		Index sum, start( 0 ), nextStart( 0 );

		// Fill blocks
		std::vector< details::CSRAdaptiveKernelBlockDescriptor< Index > > inBlocks;
		inBlocks.reserve( rows );

		while( nextStart != rows - 1 )
		{
		details::Type type;
		nextStart = findLimit( start, hostOffsets, rows, type, sum );

		if( type == details::Type::LONG )
		{
		const Index blocksCount = inBlocks.size();
		const Index warpsPerCudaBlock = details::CSRAdaptiveKernelParameters< double >::CudaBlockSize() / Cuda::getWarpSize();
		Index warpsLeft = roundUpDivision( blocksCount, warpsPerCudaBlock ) * warpsPerCudaBlock - blocksCount;
		if( warpsLeft == 0 )
		warpsLeft = warpsPerCudaBlock;
		for( Index index = 0; index < warpsLeft; index++ )
		inBlocks.emplace_back( start, details::Type::LONG, index, warpsLeft );
		}
		else
		{
		inBlocks.emplace_back(start, type,
		nextStart,
		offsets.getElement(nextStart),
		offsets.getElement(start) );
		}
		start = nextStart;
		}
		inBlocks.emplace_back(nextStart);
		this->blocks = inBlocks;
		this->view.setBlocks( blocks );
		}

		template< typename Index,
		typename Device >
		void
		CSRAdaptiveKernel< Index, Device >::
		reset()
		{
		this->blocks.reset();
		this->view.setBlocks( blocks );
		}

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernel< Index, Device >::
		getView() -> ViewType
		{
		return this->view;
		}

		template< typename Index,
		typename Device >
		auto
		CSRAdaptiveKernel< Index, Device >::
		getConstView() const -> ConstViewType
		{
		return this->view;
		};

		template< typename Index,
		typename Device >
		template< typename OffsetsView,
		typename Fetch,
		typename Reduction,
		typename ResultKeeper,
		typename Real,
		typename... Args >
		void
		CSRAdaptiveKernel< Index, Device >::
		segmentsReduction( const OffsetsView& offsets,
		Index first,
		Index last,
		Fetch& fetch,
		const Reduction& reduction,
		ResultKeeper& keeper,
		const Real& zero,
		Args... args ) const
		{
		view.segmentsReduction( offsets, first, last, fetch, reduction, keeper, zero, args... );
		}

		} // namespace Segments
		} // namespace Algorithms

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -12,6 +12,7 @@

		#include <TNL/Containers/Vector.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>

		namespace TNL {
		namespace Algorithms {

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp

+30 −77

Original line number	Diff line number	Diff line
		@@ -25,12 +25,7 @@ namespace TNL {

		#ifdef HAVE_CUDA

		template< int CudaBlockSize,
		int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP,
		typename BlocksView,
		template< typename BlocksView,
		typename Offsets,
		typename Index,
		typename Fetch,
		@@ -50,10 +45,18 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		Real zero,
		Args... args )
		{
		__shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
		__shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
		constexpr size_t MAX_X_DIM = 2147483647;
		const Index index = (gridIdx * MAX_X_DIM) + (blockIdx.x * blockDim.x) + threadIdx.x;
		constexpr size_t CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
		constexpr int WarpSize = Cuda::getWarpSize();
		constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
		constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();

		/////
		// Note, for some large large Value types the following allocation of the shared memory might fail.
		// We might add some check here.
		__shared__ Real streamShared[ WarpsCount ][ StreamedSharedElementsPerWarp ];
		__shared__ Real multivectorShared[ CudaBlockSize / WarpSize ];

		const Index index = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		const Index blockIdx = index / warpSize;
		if( blockIdx >= blocks.getSize() - 1 )
		return;
		@@ -113,32 +116,17 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		}
		else // blockType == Type::LONG - several warps per segment
		{
		// Number of elements processed by previous warps
		//const Index offset = //block.index[1] * MAX_ELEM_PER_WARP;
		/// block.getWarpIdx() * MAX_ELEM_PER_WARP;
		//Index to = begin + (block.getWarpIdx() + 1) * MAX_ELEM_PER_WARP;
		const Index segmentIdx = block.getFirstSegment();//block.index[0];
		//minID = offsets[block.index[0] ];
		const Index segmentIdx = block.getFirstSegment();
		const Index end = offsets[ segmentIdx + 1 ];
		//const int tid = threadIdx.x;
		//const int inBlockWarpIdx = block.getWarpIdx();

		//if( to > end )
		// to = end;
		TNL_ASSERT_GT( block.getWarpsCount(), 0, "" );
		result = zero;
		//printf( "LONG tid %d warpIdx %d: LONG \n", tid, block.getWarpIdx() );
		for( Index globalIdx = begin + laneIdx + TNL::Cuda::getWarpSize() * block.getWarpIdx();
		globalIdx < end;
		globalIdx += TNL::Cuda::getWarpSize() * block.getWarpsCount() )
		{
		result = reduce( result, details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, -1, globalIdx, compute ) );
		//if( laneIdx == 0 )
		// printf( "LONG warpIdx: %d gid: %d begin: %d end: %d -> %d \n", ( int ) block.getWarpIdx(), globalIdx, begin, end,
		// details::FetchLambdaAdapter< Index, Fetch >::call( fetch, segmentIdx, 0, globalIdx, compute ) );
		//result += values[i] * inVector[columnIndexes[i]];
		}
		//printf( "tid %d -> %d \n", tid, result );

		result += __shfl_down_sync( 0xFFFFFFFF, result, 16 );
		result += __shfl_down_sync( 0xFFFFFFFF, result, 8 );
		@@ -146,9 +134,6 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		result += __shfl_down_sync( 0xFFFFFFFF, result, 2 );
		result += __shfl_down_sync( 0xFFFFFFFF, result, 1 );

		//if( laneIdx == 0 )
		// printf( "WARP RESULT: tid %d -> %d \n", tid, result );

		const Index warpID = threadIdx.x / 32;
		if( laneIdx == 0 )
		multivectorShared[ warpID ] = result;
		@@ -157,7 +142,7 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		// Reduction in multivectorShared
		if( block.getWarpIdx() == 0 && laneIdx < 16 )
		{
		constexpr int totalWarps = CudaBlockSize / warpSize;
		constexpr int totalWarps = CudaBlockSize / WarpSize;
		if( totalWarps >= 32 )
		{
		multivectorShared[ laneIdx ] = reduce( multivectorShared[ laneIdx ], multivectorShared[ laneIdx + 16 ] );
		@@ -184,13 +169,10 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		__syncwarp();
		}
		if( laneIdx == 0 )
		{
		//printf( "Long: segmentIdx %d -> %d \n", segmentIdx, multivectorShared[ 0 ] );
		keep( segmentIdx, multivectorShared[ 0 ] );
		}
		}
		}
		}
		#endif

		template< typename Index,
		@@ -264,58 +246,29 @@ segmentsReduction( const OffsetsView& offsets,
		return;
		}

		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
		//static constexpr Index THREADS_SCALAR = 128;
		//static constexpr Index THREADS_VECTOR = 128;
		//static constexpr Index THREADS_LIGHT = 128;

		/* Max length of row to process one warp for CSR Light, MultiVector */
		//static constexpr Index MAX_ELEMENTS_PER_WARP = 384;

		/* Max length of row to process one warp for CSR Adaptive */
		static constexpr Index MAX_ELEMENTS_PER_WARP_ADAPT = 512;

		/* How many shared memory use per block in CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 24576;

		/* Number of elements in shared memory */
		static constexpr Index SHARED = SHARED_PER_BLOCK/sizeof(Real);

		/* Number of warps in block for CSR Adaptive */
		static constexpr Index WARPS = THREADS_ADAPTIVE / 32;

		/* Number of elements in shared memory per one warp */
		static constexpr Index SHARED_PER_WARP = SHARED / WARPS;

		constexpr int warpSize = 32;
		//constexpr int warpSize = 32;

		Index blocksCount;

		const Index threads = THREADS_ADAPTIVE;
		constexpr size_t MAX_X_DIM = 2147483647;
		const Index threads = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();

		/* Fill blocks */
		size_t neededThreads = this->blocks.getSize() * warpSize; // one warp per block
		size_t neededThreads = this->blocks.getSize() * TNL::Cuda::getWarpSize(); // one warp per block
		/* Execute kernels on device */
		for( Index gridIdx = 0; neededThreads != 0; gridIdx++ )
		{
		if (MAX_X_DIM * threads >= neededThreads)
		if( Cuda::getMaxGridSize() * threads >= neededThreads )
		{
		blocksCount = roundUpDivision( neededThreads, threads );
		neededThreads = 0;
		}
		else
		{
		blocksCount = MAX_X_DIM;
		neededThreads -= MAX_X_DIM * threads;
		blocksCount = TNL::Cuda::getMaxGridSize();
		neededThreads -= TNL::Cuda::getMaxGridSize() * threads;
		}

		segmentsReductionCSRAdaptiveKernel<
		THREADS_ADAPTIVE,
		warpSize,
		WARPS,
		SHARED_PER_WARP,
		MAX_ELEMENTS_PER_WARP_ADAPT,
		BlocksView,
		OffsetsView,
		Index, Fetch, Reduction, ResultKeeper, Real, Args... >

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h

0 → 100644

+70 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CSRAdaptiveKernelBlockDescriptor.h - description
		-------------------
		begin : Jan 25, 2021 -> Joe Biden inauguration
		copyright : (C) 2021 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		namespace TNL {
		namespace Algorithms {
		namespace Segments {
		namespace details {

		template< typename Value >
		struct CSRAdaptiveKernelParameters
		{
		/**
		* \brief Computes number of CUDA threads per block depending on Value type.
		*
		* \return CUDA block size.
		*/
		static constexpr int CudaBlockSize() { return std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };

		/**
		* \brief Returns amount of shared memory dedicated for stream CSR kernel.
		*
		* \return Stream shared memory.
		*/
		static constexpr size_t StreamedSharedMemory() { return 40960; };

		/**
		* \brief Number of elements fitting into streamed shared memory.
		*/
		static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };

		/**
		* \brief Computes number of warps in one CUDA block.
		*/
		static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); };

		/**
		* \brief Computes number of elements to be streamed into the shared memory.
		*
		* \return Number of elements to be streamed into the shared memory.
		*/
		static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); };

		/**
		* \brief Returns maximum number of elements per warp for vector and hybrid kernel.
		*
		* \return Maximum number of elements per warp for vector and hybrid kernel.
		*/
		static constexpr int MaxVectorElementsPerWarp() { return 384; };

		/**
		* \brief Returns maximum number of elements per warp for adaptive kernel.
		*
		* \return Maximum number of elements per warp for adaptive kernel.
		*/
		static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
		};

		} // namespace details
		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL