Fixed configuration of reduction kernels (71f6c6d0) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaMultireductionKernel.h

+11 −6

Original line number	Diff line number	Diff line
		@@ -32,6 +32,9 @@ namespace Algorithms {
		* architecture so that there are no local memory spills.
		*/
		static constexpr int Multireduction_maxThreadsPerBlock = 256; // must be a power of 2
		static constexpr int Multireduction_registersPerThread = 38; // empirically determined optimal value

		// __CUDA_ARCH__ is defined only in device code!
		#if (__CUDA_ARCH__ >= 300 )
		static constexpr int Multireduction_minBlocksPerMultiprocessor = 6;
		#else
		@@ -187,12 +190,14 @@ CudaMultireductionKernelLauncher( Operation& operation,
		// we run the kernel with a fixed number of blocks, so the amount of work per
		// block increases with enlarging the problem, so even small imbalance can
		// cost us dearly.
		// On Tesla K40c, desGridSizeX = 4 * 6 * 15 = 360.
		// const IndexType desGridSizeX = 4 * Multireduction_minBlocksPerMultiprocessor
		// * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		// On Tesla K40c, desGridSizeX = 6 * 15 = 90.
		const IndexType desGridSizeX = Multireduction_minBlocksPerMultiprocessor
		* Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		// Therefore, desGridSizeX = blocksPerMultiprocessor * numberOfMultiprocessors
		// where blocksPerMultiprocessor is determined according to the number of
		// available registers on the multiprocessor.
		// On Tesla K40c (65536 registers per multiprocessor, 15 multiprocessors),
		// blocksPerMultiprocessor = 65536 / (256 * 38) = 6, so desGridSizeX = 6 * 15 = 90.
		const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
		const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
		/ ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread );
		const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
		dim3 blockSize, gridSize;

		// version A: max 16 rows of threads

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+13 −7

Original line number	Diff line number	Diff line
		@@ -30,6 +30,9 @@ namespace Algorithms {
		* architecture so that there are no local memory spills.
		*/
		static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2
		static constexpr int Reduction_registersPerThread = 32; // empirically determined optimal value

		// __CUDA_ARCH__ is defined only in device code!
		#if (__CUDA_ARCH__ >= 300 )
		static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
		#else
		@@ -189,13 +192,16 @@ CudaReductionKernelLauncher( Operation& operation,
		// we run the kernel with a fixed number of blocks, so the amount of work per
		// block increases with enlarging the problem, so even small imbalance can
		// cost us dearly.
		// On Tesla K40c, desGridSize = 4 * 6 * 15 = 360.
		// const IndexType desGridSize = 4 * Reduction_minBlocksPerMultiprocessor
		// * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		// On Tesla K40c, desGridSize = 6 * 15 = 90.
		const IndexType desGridSize = Reduction_minBlocksPerMultiprocessor
		* Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() );
		dim3 blockSize( 256 ), gridSize( 0 );
		// Therefore, desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors
		// where blocksPerMultiprocessor is determined according to the number of
		// available registers on the multiprocessor.
		// On Tesla K40c, desGridSize = 8 * 15 = 120.
		const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
		const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
		/ ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread );
		const int desGridSize = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
		dim3 blockSize, gridSize;
		blockSize.x = Reduction_maxThreadsPerBlock;
		gridSize.x = min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );

		// create reference to the reduction buffer singleton and set size

src/TNL/Devices/CudaDeviceInfo.cpp

+7 −0

Original line number	Diff line number	Diff line
		@@ -107,6 +107,13 @@ getCudaCores( int deviceNum )
		return 0;
		}

		// Fallback implementation of getRegistersPerMultiprocessor in the .cpp
		// translation unit: no device is queried and the result is always 0.
		// NOTE(review): presumably this file is compiled only when CUDA support
		// is disabled -- confirm against the build system.
		int
		CudaDeviceInfo::
		getRegistersPerMultiprocessor( int deviceNum )
		{
		   // deviceNum is intentionally unused here
		   const int registersPerMultiprocessor = 0;
		   return registersPerMultiprocessor;
		}

		void
		CudaDeviceInfo::
		writeDeviceInfo( Logger& logger )

src/TNL/Devices/CudaDeviceInfo.cu

+15 −0

Original line number	Diff line number	Diff line
		@@ -168,6 +168,21 @@ getCudaCores( int deviceNum )
		CudaDeviceInfo::getCudaCoresPerMultiprocessors( deviceNum );
		}

		// Returns the number of 32-bit registers available on one multiprocessor
		// of the device `deviceNum`, as reported by cudaGetDeviceProperties
		// (field regsPerMultiprocessor).
		//
		// Successful results are cached per device number because they are used
		// for configuration of some kernels. If the device query fails, 0 is
		// returned and nothing is cached, so a later call can retry the query
		// and callers can detect the failure instead of receiving an
		// uninitialized value.
		//
		// NOTE(review): the cache is a function-local static map without any
		// locking -- confirm that this is only called from a single host thread.
		int
		CudaDeviceInfo::
		getRegistersPerMultiprocessor( int deviceNum )
		{
		   static std::unordered_map< int, int > results;

		   // single lookup instead of count() followed by operator[]
		   const auto cached = results.find( deviceNum );
		   if( cached != results.end() )
		      return cached->second;

		   cudaDeviceProp properties;
		   // on failure `properties` is not filled in, so its content must not
		   // be read or cached
		   if( cudaGetDeviceProperties( &properties, deviceNum ) != cudaSuccess )
		      return 0;

		   results.emplace( deviceNum, properties.regsPerMultiprocessor );
		   return properties.regsPerMultiprocessor;
		}

		void
		CudaDeviceInfo::
		writeDeviceInfo( Logger& logger )

src/TNL/Devices/CudaDeviceInfo.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -50,6 +50,8 @@ class CudaDeviceInfo

		static int getCudaCores( int deviceNum );

		static int getRegistersPerMultiprocessor( int deviceNum );

		static void writeDeviceInfo( Logger& logger );
		};