CudaScanKernelLauncher: configuration of blockSize depending on the ValueType (57d66051) · Commits · TNL / tnl-dev

src/TNL/Algorithms/detail/CudaScanKernel.h

+7 −2

Original line number	Diff line number	Diff line
		@@ -436,7 +436,10 @@ CudaScanKernelUniformShift( OutputView output,
		*/
		template< ScanType scanType,
		ScanPhaseType phaseType,
		int blockSize = 256,
		typename ValueType,
		// use blockSize=256 for 32-bit (4-byte) value types and scale it inversely
		// with sizeof(ValueType) to keep shared memory requirements constant
		int blockSize = 256 * 4 / sizeof(ValueType),
		// valuesPerThread should be odd to avoid shared memory bank conflicts
		int valuesPerThread = 7 >
		struct CudaScanKernelLauncher
		@@ -520,6 +523,7 @@ struct CudaScanKernelLauncher
		Reduction&& reduction,
		typename OutputArray::ValueType zero )
		{
		static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
		using Index = typename InputArray::IndexType;

		if( end - begin <= blockSize * valuesPerThread ) {
		@@ -639,7 +643,7 @@ struct CudaScanKernelLauncher

		// blockResults now contains scan results for each block. The first phase
		// ends by computing an exclusive scan of this array.
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::perform(
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::perform(
		blockResults,
		blockResults,
		0,
		@@ -689,6 +693,7 @@ struct CudaScanKernelLauncher
		typename OutputArray::ValueType zero,
		typename OutputArray::ValueType shift )
		{
		static_assert( std::is_same< ValueType, typename OutputArray::ValueType >::value, "invalid configuration of ValueType" );
		using Index = typename InputArray::IndexType;

		// if the input was already scanned with just one block in the first phase,

src/TNL/Algorithms/detail/Scan.hpp

+3 −3

Original line number	Diff line number	Diff line
		@@ -278,7 +278,7 @@ perform( const InputArray& input,
		if( end <= begin )
		return;

		detail::CudaScanKernelLauncher< Type, PhaseType >::perform(
		detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::perform(
		input,
		output,
		begin,
		@@ -312,7 +312,7 @@ performFirstPhase( const InputArray& input,
		return block_results;
		}

		return detail::CudaScanKernelLauncher< Type, PhaseType >::performFirstPhase(
		return detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performFirstPhase(
		input,
		output,
		begin,
		@@ -346,7 +346,7 @@ performSecondPhase( const InputArray& input,
		if( end <= begin )
		return;

		detail::CudaScanKernelLauncher< Type, PhaseType >::performSecondPhase(
		detail::CudaScanKernelLauncher< Type, PhaseType, typename OutputArray::ValueType >::performSecondPhase(
		input,
		output,
		blockShifts,

src/UnitTests/Algorithms/distributedScanTest.h

+10 −10

Original line number	Diff line number	Diff line
		@@ -87,14 +87,14 @@ protected:
		#ifdef HAVE_CUDA
		if( std::is_same< DeviceType, Devices::Cuda >::value )
		{
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
		}
		#endif
		}
		@@ -106,8 +106,8 @@ protected:
		// skip the check for too small arrays
		if( check_cuda_grids && array.getLocalRange().getSize() > 256 ) {
		// we don't care which kernel launcher was actually used
		const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
		CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
		const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
		CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
		EXPECT_GT( gridsCount, 1 );
		}
		#endif

src/UnitTests/Algorithms/scanTest.h

+10 −10

Original line number	Diff line number	Diff line
		@@ -59,14 +59,14 @@ protected:
		#ifdef HAVE_CUDA
		if( std::is_same< DeviceType, Devices::Cuda >::value )
		{
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInFirstPhase, ValueType >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Inclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::resetMaxGridSize();
		CudaScanKernelLauncher< ScanType::Exclusive, ScanPhaseType::WriteInSecondPhase, ValueType >::maxGridSize() = 3;
		}
		#endif
		}
		@@ -78,8 +78,8 @@ protected:
		// skip the check for too small arrays
		if( array.getSize() > 256 ) {
		// we don't care which kernel launcher was actually used
		const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase >::gridsCount(),
		CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase >::gridsCount() );
		const auto gridsCount = TNL::max( CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInFirstPhase, ValueType >::gridsCount(),
		CudaScanKernelLauncher< ScanType, ScanPhaseType::WriteInSecondPhase, ValueType >::gridsCount() );
		EXPECT_GT( gridsCount, 1 );
		}
		#endif