Optimized parallel reduction on CUDA (a2fb0bdf) · Commits · TNL / tnl-dev

src/TNL/Containers/Algorithms/CudaReductionKernel.h

+1 −1

Original line number	Diff line number	Diff line
		@@ -31,7 +31,7 @@ namespace Algorithms {
		*/
		// NOTE(review): judging by the names, these constants tune the launch
		// configuration of the reduction kernels; their use is not visible here.
		static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2
		// NOTE(review): __CUDA_ARCH__ is defined only while device code is being
		// compiled, so host code evaluating this #if always takes the #else branch
		// -- confirm the constant is not expected to agree between host and device.
		#if (__CUDA_ARCH__ >= 300 )
		// NOTE(review): this excerpt is a diff ("+1 -1") whose +/- markers were lost;
		// presumably the next line (6) is the removed value and the one after it (8)
		// is its replacement -- confirm against the commit.
		static constexpr int Reduction_minBlocksPerMultiprocessor = 6;
		static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
		#else
		static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
		#endif

+11 −3

Original line number	Diff line number	Diff line
		@@ -10,6 +10,8 @@

		#ifdef HAVE_CUDA

		#include <unordered_map>

		#include <TNL/Devices/CudaDeviceInfo.h>
		#include <TNL/Devices/Cuda.h>

		@@ -101,10 +103,16 @@ int
		CudaDeviceInfo::
		getCudaMultiprocessors( int deviceNum )
		{
		   // Returns the multiprocessor count reported by the CUDA runtime for the
		   // device `deviceNum`, or 0 when the runtime query fails.
		   //
		   // Results are cached because they are used for configuration of some
		   // kernels, so each device is queried at most once on success.
		   // NOTE(review): the static cache is not synchronized -- confirm that
		   // callers do not invoke this concurrently from several host threads.
		   static std::unordered_map< int, int > results;

		   // Single lookup instead of count() followed by operator[].
		   const auto cached = results.find( deviceNum );
		   if( cached != results.end() )
		      return cached->second;

		   // Value-initialize so a failed query can never expose indeterminate data.
		   cudaDeviceProp properties{};
		   if( cudaGetDeviceProperties( &properties, deviceNum ) != cudaSuccess ) {
		      // Failures are deliberately not cached: a later call queries the
		      // device again instead of returning a stale bogus value forever.
		      return 0;
		   }

		   results.emplace( deviceNum, properties.multiProcessorCount );
		   return properties.multiProcessorCount;
		}

		int
		CudaDeviceInfo::