This project is archived. Its data is read-only.

Commit 045eb5f4 authored Jun 13, 2020 by Jakub Klinkovský

Fixed __launch_bounds__ optimizations for Turing GPUs

parent 1a1a4867

src/TNL/Algorithms/CudaMultireductionKernel.h

+4 −3

Original line number	Diff line number	Diff line
		@@ -32,10 +32,11 @@ static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a powe
		static constexpr int Multireduction_registersPerThread = 32; // empirically determined optimal value

		// __CUDA_ARCH__ is defined only in device code!
		-#if (__CUDA_ARCH__ >= 300 )
		-static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
		-#else
		+#if (__CUDA_ARCH__ == 750 )
		+// Turing has a limit of 1024 threads per multiprocessor
		 static constexpr int Multireduction_minBlocksPerMultiprocessor = 4;
		+#else
		+static constexpr int Multireduction_minBlocksPerMultiprocessor = 8;
		#endif

		template< int blockSizeX,

src/TNL/Algorithms/CudaReductionKernel.h

+4 −3

Original line number	Diff line number	Diff line
		@@ -33,10 +33,11 @@ static constexpr int Reduction_registersPerThread = 32; // empirically determi

		#ifdef HAVE_CUDA
		// __CUDA_ARCH__ is defined only in device code!
		-#if (__CUDA_ARCH__ >= 300 )
		-static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
		-#else
		+#if (__CUDA_ARCH__ == 750 )
		+// Turing has a limit of 1024 threads per multiprocessor
		 static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
		+#else
		+static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
		#endif

		template< int blockSize,