Added CSRAdaptiveKernelParameters. (25e55913) · Commits · TNL / tnl-dev

src/TNL/Algorithms/Segments/CSRAdaptiveKernel.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -65,7 +65,7 @@ struct CSRAdaptiveKernel

		static TNL::String getKernelType();

		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
		static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Index >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

		/* How many shared memory use per block in CSR Adaptive kernel */
		static constexpr Index SHARED_PER_BLOCK = 20000; //24576; TODO:

src/TNL/Algorithms/Segments/CSRAdaptiveKernelView.hpp

+9 −7

Original line number	Diff line number	Diff line
		@@ -18,6 +18,7 @@
		#include <TNL/Algorithms/Segments/CSRScalarKernel.h>
		#include <TNL/Algorithms/Segments/CSRAdaptiveKernelView.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelBlockDescriptor.h>
		#include <TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h>

		namespace TNL {
		namespace Algorithms {
		@@ -25,8 +26,7 @@ namespace TNL {

		#ifdef HAVE_CUDA

		template< int CudaBlockSize,
		int warpSize,
		template< int warpSize,
		int WARPS,
		int SHARED_PER_WARP,
		int MAX_ELEM_PER_WARP,
		@@ -50,6 +50,12 @@ segmentsReductionCSRAdaptiveKernel( BlocksView blocks,
		Real zero,
		Args... args )
		{
		static constexpr int CudaBlockSize = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize();
		constexpr int WarpSize = Cuda::getWarpSize();
		constexpr int WarpsCount = details::CSRAdaptiveKernelParameters< Real >::WarpsCount();
		constexpr size_t StreamedSharedElementsPerWarp = details::CSRAdaptiveKernelParameters< Real >::StreamedSharedElementsPerWarp();


		__shared__ Real streamShared[ WARPS ][ SHARED_PER_WARP ];
		__shared__ Real multivectorShared[ CudaBlockSize / warpSize ];
		constexpr size_t MAX_X_DIM = 2147483647;
		@@ -264,10 +270,7 @@ segmentsReduction( const OffsetsView& offsets,
		return;
		}

		static constexpr Index THREADS_ADAPTIVE = sizeof(Index) == 8 ? 128 : 256;
		//static constexpr Index THREADS_SCALAR = 128;
		//static constexpr Index THREADS_VECTOR = 128;
		//static constexpr Index THREADS_LIGHT = 128;
		static constexpr Index THREADS_ADAPTIVE = details::CSRAdaptiveKernelParameters< Real >::CudaBlockSize(); //sizeof(Index) == 8 ? 128 : 256;

		/* Max length of row to process one warp for CSR Light, MultiVector */
		//static constexpr Index MAX_ELEMENTS_PER_WARP = 384;
		@@ -311,7 +314,6 @@ segmentsReduction( const OffsetsView& offsets,
		}

		segmentsReductionCSRAdaptiveKernel<
		THREADS_ADAPTIVE,
		warpSize,
		WARPS,
		SHARED_PER_WARP,

src/TNL/Algorithms/Segments/details/CSRAdaptiveKernelParameters.h

0 → 100644

+72 −0

Original line number	Diff line number	Diff line
		/***************************************************************************
		CSRAdaptiveKernelBlockDescriptor.h - description
		-------------------
		begin : Jan 25, 2021 -> Joe Biden inauguration
		copyright : (C) 2021 by Tomas Oberhuber
		email : tomas.oberhuber@fjfi.cvut.cz
		***************************************************************************/

		/* See Copyright Notice in tnl/Copyright */

		#pragma once

		namespace TNL {
		namespace Algorithms {
		namespace Segments {
		namespace details {

		template< typename Value >
		struct CSRAdaptiveKernelParameters
		{
		static const int StreamedSharedMemory_ = 20000;
		/**
		* \brief Computes number of CUDA threads per block depending on Value type.
		*
		* \return CUDA block size.
		*/
		static constexpr int CudaBlockSize() { return 256; }; //sizeof( Value ) == 8 ? 128 : 256; };
		//std::max( ( int ) ( 1024 / sizeof( Value ) ), ( int ) Cuda::getWarpSize() ); };

		/**
		* \brief Returns amount of shared memory dedicated for stream CSR kernel.
		*
		* \return Stream shared memory.
		*/
		static constexpr size_t StreamedSharedMemory() { return StreamedSharedMemory_; };

		/**
		* \brief Number of elements fitting into streamed shared memory.
		*/
		static constexpr size_t StreamedSharedElementsCount() { return StreamedSharedMemory() / sizeof( Value ); };

		/**
		* \brief Computes number of warps in one CUDA block.
		*/
		static constexpr size_t WarpsCount() { return CudaBlockSize() / Cuda::getWarpSize(); };

		/**
		* \brief Computes number of elements to be streamed into the shared memory.
		*
		* \return Number of elements to be streamed into the shared memory.
		*/
		static constexpr size_t StreamedSharedElementsPerWarp() { return StreamedSharedElementsCount() / WarpsCount(); };

		/**
		* \brief Returns maximum number of elements per warp for vector and hybrid kernel.
		*
		* \return Maximum number of elements per warp for vector and hybrid kernel.
		*/
		static constexpr int MaxVectorElementsPerWarp() { return 384; };

		/**
		* \brief Returns maximum number of elements per warp for adaptive kernel.
		*
		* \return Maximum number of elements per warp for adaptive kernel.
		*/
		static constexpr int MaxAdaptiveElementsPerWarp() { return 512; };
		};

		} // namespace details
		} // namespace Segments
		} // namespace Algorithms
		} // namespace TNL