Commit a2fb0bdf authored by Jakub Klinkovský
Browse files

Optimized parallel reduction on CUDA

parent 592f6355
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -31,7 +31,7 @@ namespace Algorithms {
 */
// Compile-time constants for the reduction code.
// NOTE(review): presumably consumed by the launch configuration of the
// reduction kernels -- confirm at the use sites.
static constexpr int Reduction_maxThreadsPerBlock = 256;  // must be a power of 2
// __CUDA_ARCH__ is defined only while device code is being compiled; in the
// host compilation pass it is undefined, so the condition below is false there
// and the #else value is the one host code sees.
#if (__CUDA_ARCH__ >= 300 )
   // NOTE(review): the next two lines define the same constant with different
   // values (6 and 8). They look like the removed and added sides of a one-line
   // change shown without +/- markers; a compilable header can keep only one --
   // confirm which value is current.
   static constexpr int Reduction_minBlocksPerMultiprocessor = 6;
   static constexpr int Reduction_minBlocksPerMultiprocessor = 8;
#else
   static constexpr int Reduction_minBlocksPerMultiprocessor = 4;
#endif
+11 −3
Original line number Diff line number Diff line
@@ -10,6 +10,8 @@

#ifdef HAVE_CUDA

#include <unordered_map>

#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Devices/Cuda.h>

@@ -101,10 +103,16 @@ int
CudaDeviceInfo::
getCudaMultiprocessors( int deviceNum )
{
    // Returns the multiprocessor count that the CUDA runtime reports for the
    // device with index `deviceNum`, or 0 when the device query fails (for
    // example, an invalid device index).
    //
    // Successful results are cached per device index because they are used for
    // configuration of some kernels. Failed queries are never cached, so a
    // later call retries instead of returning a stale or uninitialized value.
    //
    // NOTE: the cache is a plain function-local static map with no locking in
    // this function.
    static std::unordered_map< int, int > results;

    // Single lookup: serves the cached value without hashing the key twice.
    const auto cached = results.find( deviceNum );
    if( cached != results.end() )
        return cached->second;

    cudaDeviceProp properties;
    // On failure `properties` is not filled in, so it must not be read or cached.
    if( cudaGetDeviceProperties( &properties, deviceNum ) != cudaSuccess )
        return 0;

    results.emplace( deviceNum, properties.multiProcessorCount );
    return properties.multiProcessorCount;
}

int
CudaDeviceInfo::