Commit 07d4cb49 authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Moved everything from Cuda.cpp and Cuda.cu into Cuda_impl.h

Having 4 different files (Cuda.h, Cuda_impl.h, Cuda.cpp, Cuda.cu) for
one class is not clear at all. On a related note, a class with 28
methods is clearly too long to be considered as well-designed, so it
should be split into several subclasses. Obviously, the full CUDA
support cannot be provided/wrapped by a single class so the design
should be more flexible.
parent 69488aac
Loading
Loading
Loading
Loading
+0 −2
Original line number Diff line number Diff line
@@ -8,7 +8,6 @@ set (headers Cuda.h

SET( CURRENT_DIR ${CMAKE_SOURCE_DIR}/src/TNL/Devices )
set( common_SOURCES
     ${CURRENT_DIR}/Cuda.cpp
     ${CURRENT_DIR}/Host.cpp 
     ${CURRENT_DIR}/MIC.cpp
     ${CURRENT_DIR}/SystemInfo.cpp )
@@ -16,7 +15,6 @@ set( common_SOURCES
IF( BUILD_CUDA )
   set( tnl_devices_CUDA__SOURCES
        ${common_SOURCES} 
        ${CURRENT_DIR}/Cuda.cu
        ${CURRENT_DIR}/CudaDeviceInfo.cu
        PARENT_SCOPE )
ENDIF()    

src/TNL/Devices/Cuda.cpp

deleted 100644 → 0
+0 −64
Original line number Diff line number Diff line
/***************************************************************************
                          Cuda.cpp  -  description
                             -------------------
    begin                : Jul 11, 2013
    copyright            : (C) 2013 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#include <TNL/Devices/Cuda.h>
#include <TNL/Math.h>
#include <TNL/Devices/CudaDeviceInfo.h>

namespace TNL {
namespace Devices {

SmartPointersRegister Cuda::smartPointersRegister;
Timer Cuda::smartPointersSynchronizationTimer;

/****
 * Returns the string identifier of this device type.
 */
String Cuda::getDeviceType()
{
   const String deviceType( "Devices::Cuda" );
   return deviceType;
}

/****
 * Computes how many CUDA blocks of the given size are needed to cover
 * the requested number of threads (division rounded up).
 */
int Cuda::getNumberOfBlocks( const int threads,
                             const int blockSize )
{
   const int blocks = roundUpDivision( threads, blockSize );
   return blocks;
}

/****
 * Computes how many CUDA grids of the given size are needed to cover
 * the requested number of blocks (division rounded up).
 */
int Cuda::getNumberOfGrids( const int blocks,
                            const int gridSize )
{
   const int grids = roundUpDivision( blocks, gridSize );
   return grids;
}

// Registers a smart pointer under the currently active CUDA device id.
void Cuda::insertSmartPointer( SmartPointer* pointer )
{
   smartPointersRegister.insert( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

// Unregisters a smart pointer from the register of the currently active CUDA device.
void Cuda::removeSmartPointer( SmartPointer* pointer )
{
   smartPointersRegister.remove( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

/****
 * Synchronizes all smart pointers registered for the given CUDA device.
 * A negative deviceId means "use the currently active device". The time
 * spent synchronizing is accumulated in smartPointersSynchronizationTimer.
 */
bool Cuda::synchronizeDevice( int deviceId )
{
#ifdef HAVE_CUDA_UNIFIED_MEMORY
   // with unified memory there is nothing to synchronize explicitly
   return true;
#else
   if( deviceId < 0 )
      deviceId = Devices::CudaDeviceInfo::getActiveDevice();
   smartPointersSynchronizationTimer.start();
   const bool result = smartPointersRegister.synchronizeDevice( deviceId );
   smartPointersSynchronizationTimer.stop();
   return result;
#endif
}

} // namespace Devices
} // namespace TNL

src/TNL/Devices/Cuda.cu

deleted 100644 → 0
+0 −119
Original line number Diff line number Diff line
/***************************************************************************
                          Cuda.cu  -  description
                             -------------------
    begin                : Dec 22, 2014
    copyright            : (C) 2014 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaRuntimeError.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Config/ParameterContainer.h>

namespace TNL {
namespace Devices {


// Computes the CUDA blocks count (ceil(threads/blockSize), at least 1 per
// dimension) and grids count (ceil(blocksCount/getMaxGridSize())) for the
// requested numbers of threads in the x, y and z dimensions.
void Cuda::setupThreads( const dim3& blockSize,
                         dim3& blocksCount,
                         dim3& gridsCount,
                         long long int xThreads,
                         long long int yThreads,
                         long long int zThreads )
{
   // ceil division written as quotient + (remainder != 0); max with 1
   // guarantees at least one block even for zero threads
   blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) );
   blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) );
   blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) );
   
   /****
    * TODO: Fix the following:
    * I do not know how to get max grid size in kernels :(
    * 
    * Also, this is very slow. */
   /*int currentDevice( 0 );
   cudaGetDevice( currentDevice );
   cudaDeviceProp properties;
   cudaGetDeviceProperties( &properties, currentDevice );
   gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 );
   gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 );
   gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 );
   */
   // getMaxGridSize() is used instead of querying device properties, which
   // would be slow and is not available from device code
   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
}

// Computes the size of the grid identified by gridIdx: every grid except
// the last one in each dimension is full-sized (getMaxGridSize()); the
// last grid receives the remaining blocks.
void Cuda::setupGrid( const dim3& blocksCount,
                      const dim3& gridsCount,
                      const dim3& gridIdx,
                      dim3& gridSize )
{
   /* TODO: this is extremely slow!!!!
   int currentDevice( 0 );
   cudaGetDevice( &currentDevice );
   cudaDeviceProp properties;
   cudaGetDeviceProperties( &properties, currentDevice );*/
 
   /****
    * TODO: fix the following
   if( gridIdx.x < gridsCount.x )
      gridSize.x = properties.maxGridSize[ 0 ];
   else
      gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ];
   
   if( gridIdx.y < gridsCount.y )
      gridSize.y = properties.maxGridSize[ 1 ];
   else
      gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ];

   if( gridIdx.z < gridsCount.z )
      gridSize.z = properties.maxGridSize[ 2 ];
   else
      gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/
   
   // NOTE(review): when blocksCount.x is an exact multiple of
   // getMaxGridSize(), the remainder below is 0, so the last grid gets
   // size 0 instead of getMaxGridSize() — looks like an edge-case bug;
   // verify against callers.
   if( gridIdx.x < gridsCount.x - 1 )
      gridSize.x = getMaxGridSize();
   else
      gridSize.x = blocksCount.x % getMaxGridSize();
   
   if( gridIdx.y < gridsCount.y - 1 )
      gridSize.y = getMaxGridSize();
   else
      gridSize.y = blocksCount.y % getMaxGridSize();

   if( gridIdx.z < gridsCount.z - 1 )
      gridSize.z = getMaxGridSize();
   else
      gridSize.z = blocksCount.z % getMaxGridSize();
}

/****
 * Prints a human-readable summary of the computed CUDA launch
 * configuration into the given output stream.
 */
void Cuda::printThreadsSetup( const dim3& blockSize,
                              const dim3& blocksCount,
                              const dim3& gridSize,
                              const dim3& gridsCount,
                              std::ostream& str )
{
   str << "Block size: " << blockSize << std::endl;
   str << " Blocks count: " << blocksCount << std::endl;
   str << " Grid size: " << gridSize << std::endl;
   str << " Grids count: " << gridsCount << std::endl;
}


/****
 * Throws Exceptions::CudaRuntimeError annotated with the call site
 * whenever the given CUDA status code is not cudaSuccess.
 */
void Cuda::checkDevice( const char* file_name, int line, cudaError error )
{
   if( error == cudaSuccess )
      return;
   throw Exceptions::CudaRuntimeError( error, file_name, line );
}

// Pretty-prints a dim3 as "( x, y, z )".
std::ostream& operator << ( std::ostream& str, const dim3& d )
{
   str << "( " << d.x;
   str << ", " << d.y;
   str << ", " << d.z;
   str << " )";
   return str;
}

} // namespace Devices
} // namespace TNL
+26 −24
Original line number Diff line number Diff line
@@ -31,7 +31,12 @@ class Cuda
{
   public:

   static String getDeviceType();
   static inline String getDeviceType();

   static inline void configSetup( Config::ConfigDescription& config, const String& prefix = "" );

   static inline bool setup( const Config::ParameterContainer& parameters,
                             const String& prefix = "" );

   __cuda_callable__ static inline constexpr int getMaxGridSize();

@@ -66,7 +71,7 @@ class Cuda
    * number of the CUDA threads and the block size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfBlocks( const int threads,
   static inline int getNumberOfBlocks( const int threads,
                                        const int blockSize );

   /****
@@ -74,7 +79,7 @@ class Cuda
    * number of the CUDA blocks and maximum grid size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfGrids( const int blocks,
   static inline int getNumberOfGrids( const int blocks,
                                       const int gridSize = getMaxGridSize() );
   
#ifdef HAVE_CUDA   
@@ -153,29 +158,24 @@ class Cuda
    * of calling cudaGetLastError() inside the method.
    * We recommend to use macro 'TNL_CHECK_CUDA_DEVICE' defined bellow.
    */
   static void checkDevice( const char* file_name, int line, cudaError error );
   static inline void checkDevice( const char* file_name, int line, cudaError error );
#else
   static void checkDevice() {}
   static inline void checkDevice() {}
#endif

   static void configSetup( Config::ConfigDescription& config, const String& prefix = "" );
      
   static bool setup( const Config::ParameterContainer& parameters,
                      const String& prefix = "" );
   
   static void insertSmartPointer( SmartPointer* pointer );
   static inline void insertSmartPointer( SmartPointer* pointer );

   static void removeSmartPointer( SmartPointer* pointer );
   static inline void removeSmartPointer( SmartPointer* pointer );

   // Negative deviceId means that CudaDeviceInfo::getActiveDevice will be
   // called to get the device ID.
   static bool synchronizeDevice( int deviceId = -1 );
   static inline bool synchronizeDevice( int deviceId = -1 );

   static Timer smartPointersSynchronizationTimer;
   static inline Timer& getSmartPointersSynchronizationTimer();

   protected:

   static SmartPointersRegister smartPointersRegister;
   static inline SmartPointersRegister& getSmartPointersRegister();
};

#ifdef HAVE_CUDA
@@ -185,7 +185,9 @@ class Cuda
#endif

#ifdef HAVE_CUDA
namespace {
   std::ostream& operator << ( std::ostream& str, const dim3& d );
}
#endif

#ifdef HAVE_CUDA
+178 −17
Original line number Diff line number Diff line
@@ -10,9 +10,12 @@

#pragma once

#include <TNL/Math.h>
#include <TNL/Devices/Cuda.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Exceptions/CudaRuntimeError.h>
#include <TNL/CudaSharedMemory.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Config/ParameterContainer.h>
@@ -20,6 +23,39 @@
namespace TNL {
namespace Devices {

// Returns the string identifier of this device type.
inline String Cuda::getDeviceType()
{
   return String( "Cuda" );
}

// Registers the "cuda-device" configuration entry (device id, default 0)
// under the given prefix; the help text notes when CUDA is unavailable.
inline void
Cuda::configSetup( Config::ConfigDescription& config,
                   const String& prefix )
{
#ifdef HAVE_CUDA
   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device to run the computation.", 0 );
#else
   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device to run the computation (not supported on this system).", 0 );
#endif
}

// Activates the CUDA device selected by the "cuda-device" parameter and
// resets the smart-pointer synchronization timer. Returns false when the
// device cannot be activated; always returns true when CUDA is disabled.
inline bool
Cuda::setup( const Config::ParameterContainer& parameters,
             const String& prefix )
{
#ifdef HAVE_CUDA
   int cudaDevice = parameters.getParameter< int >( prefix + "cuda-device" );
   if( cudaSetDevice( cudaDevice ) != cudaSuccess )
   {
      std::cerr << "I cannot activate CUDA device number " << cudaDevice << "." << std::endl;
      return false;
   }
   // reset + stop so the timer starts from zero and only runs during
   // synchronizeDevice() calls
   getSmartPointersSynchronizationTimer().reset();
   getSmartPointersSynchronizationTimer().stop();
#endif
   return true;
}

__cuda_callable__
inline constexpr int Cuda::getMaxGridSize()
{
@@ -71,6 +107,105 @@ __device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx )
}
#endif

// Number of CUDA blocks needed to cover `threads` threads with the given
// block size (division rounded up).
inline int Cuda::getNumberOfBlocks( const int threads,
                                    const int blockSize )
{
   return roundUpDivision( threads, blockSize );
}

// Number of CUDA grids needed to cover `blocks` blocks with the given
// grid size (division rounded up).
inline int Cuda::getNumberOfGrids( const int blocks,
                                   const int gridSize )
{
   return roundUpDivision( blocks, gridSize );
}

#ifdef HAVE_CUDA
/****
 * Computes the CUDA blocks count and grids count needed to cover the
 * requested numbers of threads in the x, y and z dimensions.
 *
 * Blocks counts are ceil(threads / blockSize), clamped to at least 1 so
 * that even a zero-thread dimension yields a valid launch configuration.
 * Grids counts are ceil(blocksCount / getMaxGridSize()).
 *
 * getMaxGridSize() is a compile-time constant: querying
 * cudaGetDeviceProperties here would be very slow and is not possible
 * from device code.
 */
inline void Cuda::setupThreads( const dim3& blockSize,
                                dim3& blocksCount,
                                dim3& gridsCount,
                                long long int xThreads,
                                long long int yThreads,
                                long long int zThreads )
{
   // ceil division written as quotient + (remainder != 0)
   blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) );
   blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) );
   blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) );

   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
}

/****
 * Computes the size of the grid identified by gridIdx: every grid except
 * the last one in each dimension is full-sized (getMaxGridSize()); the
 * last grid receives the remaining blocks.
 *
 * Fix: when blocksCount in some dimension is an exact multiple of
 * getMaxGridSize(), the remainder is 0 although the last grid is actually
 * full — in that case the grid size must be getMaxGridSize(), not 0.
 */
inline void Cuda::setupGrid( const dim3& blocksCount,
                             const dim3& gridsCount,
                             const dim3& gridIdx,
                             dim3& gridSize )
{
   if( gridIdx.x < gridsCount.x - 1 )
      gridSize.x = getMaxGridSize();
   else {
      gridSize.x = blocksCount.x % getMaxGridSize();
      if( gridSize.x == 0 )
         gridSize.x = getMaxGridSize();
   }

   if( gridIdx.y < gridsCount.y - 1 )
      gridSize.y = getMaxGridSize();
   else {
      gridSize.y = blocksCount.y % getMaxGridSize();
      if( gridSize.y == 0 )
         gridSize.y = getMaxGridSize();
   }

   if( gridIdx.z < gridsCount.z - 1 )
      gridSize.z = getMaxGridSize();
   else {
      gridSize.z = blocksCount.z % getMaxGridSize();
      if( gridSize.z == 0 )
         gridSize.z = getMaxGridSize();
   }
}

/****
 * Prints a human-readable summary of the computed CUDA launch
 * configuration into the given output stream.
 */
inline void Cuda::printThreadsSetup( const dim3& blockSize,
                                     const dim3& blocksCount,
                                     const dim3& gridSize,
                                     const dim3& gridsCount,
                                     std::ostream& str )
{
   str << "Block size: " << blockSize << std::endl;
   str << " Blocks count: " << blocksCount << std::endl;
   str << " Grid size: " << gridSize << std::endl;
   str << " Grids count: " << gridsCount << std::endl;
}
#endif


template< typename ObjectType >
ObjectType* Cuda::passToDevice( const ObjectType& object )
@@ -162,33 +297,59 @@ __device__ Element* Cuda::getSharedMemory()
   return CudaSharedMemory< Element >();
}

inline void
Cuda::configSetup( Config::ConfigDescription& config,
                   const String& prefix )
{
#ifdef HAVE_CUDA
   config.addEntry< int >( prefix + "cuda-device", "Choose CUDA device to run the computation.", 0 );
// Throws Exceptions::CudaRuntimeError annotated with the call site
// whenever the given CUDA status code is not cudaSuccess.
inline void Cuda::checkDevice( const char* file_name, int line, cudaError error )
{
   if( error != cudaSuccess )
      throw Exceptions::CudaRuntimeError( error, file_name, line );
}
#endif

// Registers a smart pointer under the currently active CUDA device id.
inline void Cuda::insertSmartPointer( SmartPointer* pointer )
{
   getSmartPointersRegister().insert( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

// Unregisters a smart pointer from the register of the currently active CUDA device.
inline void Cuda::removeSmartPointer( SmartPointer* pointer )
{
   getSmartPointersRegister().remove( pointer, Devices::CudaDeviceInfo::getActiveDevice() );
}

/****
 * Synchronizes all smart pointers registered for the given CUDA device.
 * A negative deviceId means "use the currently active device". The time
 * spent synchronizing is accumulated in the smart-pointers
 * synchronization timer. With unified memory there is nothing to
 * synchronize explicitly.
 *
 * (A stray line from the removed Cuda::setup — `config.addEntry< int >(...)`
 * — was interleaved into this body by the diff rendering and has been
 * dropped; it does not belong to this function.)
 */
inline bool Cuda::synchronizeDevice( int deviceId )
{
#ifdef HAVE_CUDA_UNIFIED_MEMORY
   return true;
#else
   if( deviceId < 0 )
      deviceId = Devices::CudaDeviceInfo::getActiveDevice();
   getSmartPointersSynchronizationTimer().start();
   const bool result = getSmartPointersRegister().synchronizeDevice( deviceId );
   getSmartPointersSynchronizationTimer().stop();
   return result;
#endif
}

inline bool
Cuda::setup( const Config::ParameterContainer& parameters,
             const String& prefix )
// Accessor for the shared synchronization timer. The function-local
// static (Meyers singleton) replaces the former class-static member so
// the definition can live in a header without ODR/initialization-order
// issues.
inline Timer& Cuda::getSmartPointersSynchronizationTimer()
{
   static Timer timer;
   return timer;
}

// Accessor for the shared smart-pointers register. Function-local static
// (Meyers singleton) replacing the former class-static member, so the
// definition can live in a header.
inline SmartPointersRegister& Cuda::getSmartPointersRegister()
{
   static SmartPointersRegister reg;
   return reg;
}

#ifdef HAVE_CUDA
namespace {
   // Pretty-prints a dim3 as "( x, y, z )". Kept in an anonymous
   // namespace so each translation unit including this header gets its
   // own internal-linkage definition.
   // (This span was corrupted by interleaved removed-diff lines from the
   // old Cuda::setup; reconstructed here as the clean added block.)
   std::ostream& operator << ( std::ostream& str, const dim3& d )
   {
      str << "( " << d.x << ", " << d.y << ", " << d.z << " )";
      return str;
   }
}
#endif

// double-precision atomicAdd function for Maxwell and older GPUs
// copied from: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions