Loading src/TNL/Containers/Algorithms/CudaReductionKernel.h +1 −1 Original line number Diff line number Diff line Loading @@ -31,7 +31,7 @@ namespace Algorithms { */ static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 #if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 6; static constexpr int Reduction_minBlocksPerMultiprocessor = 8; #else static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif Loading src/TNL/Devices/CudaDeviceInfo.cu +11 −3 Original line number Diff line number Diff line Loading @@ -10,6 +10,8 @@ #ifdef HAVE_CUDA #include <unordered_map> #include <TNL/Devices/CudaDeviceInfo.h> #include <TNL/Devices/Cuda.h> Loading Loading @@ -101,10 +103,16 @@ int CudaDeviceInfo:: getCudaMultiprocessors( int deviceNum ) { // results are cached because they are used for configuration of some kernels static std::unordered_map< int, int > results; if( results.count( deviceNum ) == 0 ) { cudaDeviceProp properties; cudaGetDeviceProperties( &properties, deviceNum ); results.emplace( deviceNum, properties.multiProcessorCount ); return properties.multiProcessorCount; } return results[ deviceNum ]; } int CudaDeviceInfo:: Loading Loading
src/TNL/Containers/Algorithms/CudaReductionKernel.h +1 −1 Original line number Diff line number Diff line Loading @@ -31,7 +31,7 @@ namespace Algorithms { */ static constexpr int Reduction_maxThreadsPerBlock = 256; // must be a power of 2 #if (__CUDA_ARCH__ >= 300 ) static constexpr int Reduction_minBlocksPerMultiprocessor = 6; static constexpr int Reduction_minBlocksPerMultiprocessor = 8; #else static constexpr int Reduction_minBlocksPerMultiprocessor = 4; #endif Loading
src/TNL/Devices/CudaDeviceInfo.cu +11 −3 Original line number Diff line number Diff line Loading @@ -10,6 +10,8 @@ #ifdef HAVE_CUDA #include <unordered_map> #include <TNL/Devices/CudaDeviceInfo.h> #include <TNL/Devices/Cuda.h> Loading Loading @@ -101,10 +103,16 @@ int CudaDeviceInfo:: getCudaMultiprocessors( int deviceNum ) { // results are cached because they are used for configuration of some kernels static std::unordered_map< int, int > results; if( results.count( deviceNum ) == 0 ) { cudaDeviceProp properties; cudaGetDeviceProperties( &properties, deviceNum ); results.emplace( deviceNum, properties.multiProcessorCount ); return properties.multiProcessorCount; } return results[ deviceNum ]; } int CudaDeviceInfo:: Loading