Commit d359d3d1 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Implemented helper functions for setup of CUDA grids.

parent 43fdac63
Loading
Loading
Loading
Loading
+101 −0
Original line number Diff line number Diff line
@@ -15,6 +15,101 @@
namespace TNL {
namespace Devices {


void Cuda::setupThreads( const dim3& blockSize,
                         dim3& blocksCount,
                         dim3& gridsCount,
                         long long int xThreads,
                         long long int yThreads,
                         long long int zThreads )
{
   if( blockSize.x )
      blocksCount.x = xThreads / blockSize.x + ( xThreads % blockSize.x != 0 );
   else
      blocksCount.x = 0;
   if( blockSize.y )
      blocksCount.y = yThreads / blockSize.y + ( yThreads % blockSize.y != 0 );
   else
      blocksCount.y = 0;
   if( blockSize.z )
      blocksCount.z = xThreads / blockSize.z + ( zThreads % blockSize.z != 0 );
   else
      blocksCount.z = 0;
   
   /****
    * TODO: Fix the following:
    * I do not known how to get max grid size in kernels :(
    * 
   int currentDevice( 0 );
   cudaGetDevice( currentDevice );
   cudaDeviceProp properties;
   cudaGetDeviceProperties( &properties, currentDevice );
   gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 );
   gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 );
   gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 );
   */
   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );   
}

void Cuda::setupGrid( const dim3& blocksCount,
                      const dim3& gridsCount,
                      const dim3& gridIdx,
                      dim3& gridSize )
{
   int currentDevice( 0 );
   cudaGetDevice( &currentDevice );
   cudaDeviceProp properties;
   cudaGetDeviceProperties( &properties, currentDevice );
 
   /****
    * TODO: fix the following
   if( gridIdx.x < gridsCount.x )
      gridSize.x = properties.maxGridSize[ 0 ];
   else
      gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ];
   
   if( gridIdx.y < gridsCount.y )
      gridSize.y = properties.maxGridSize[ 1 ];
   else
      gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ];

   if( gridIdx.z < gridsCount.z )
      gridSize.z = properties.maxGridSize[ 2 ];
   else
      gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/
   
   if( gridIdx.x < gridsCount.x )
      gridSize.x = getMaxGridSize();
   else
      gridSize.x = blocksCount.x % getMaxGridSize();
   
   if( gridIdx.y < gridsCount.y )
      gridSize.y = getMaxGridSize();
   else
      gridSize.y = blocksCount.y % getMaxGridSize();

   if( gridIdx.z < gridsCount.z )
      gridSize.z = getMaxGridSize();
   else
      gridSize.z = blocksCount.z % getMaxGridSize();
   
}

void Cuda::printThreadsSetup( const dim3& blockSize,
                              const dim3& blocksCount,
                              const dim3& gridSize,
                              const dim3& gridsCount,
                              std::ostream& str )
{
   /*str << "Block size: " << blockSize << std::endl
       << " Blocks count: " << blocksCount << std::endl
       << " Grid size: " << gridSize << std::endl
       << " Grids count: " << gridsCount << std::endl;*/
}


bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
{   
   if( error == cudaSuccess )
@@ -397,5 +492,11 @@ bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
   return false;
}

std::ostream& operator << ( std::ostream& str, const dim3& d )
{
   str << "( " << d.x << ", " << d.y << ", " << d.z << " )";
   return str;
}

} // namespace Devices
} // namespace TNL
+52 −1
Original line number Diff line number Diff line
@@ -49,17 +49,67 @@ class Cuda
   static inline constexpr int getGPUTransferBufferSize();

#ifdef HAVE_CUDA
   /***
    * This function is obsolete and should be replaced by the following functions.
    */
   __device__ static inline int
   getGlobalThreadIdx( const int gridIdx = 0,
                       const int gridSize = getMaxGridSize() );   

   __device__ static inline int
   getGlobalThreadIdx_x( const dim3& gridIdx );

   __device__ static inline int
   getGlobalThreadIdx_y( const dim3& gridIdx );

   __device__ static inline int
   getGlobalThreadIdx_z( const dim3& gridIdx );   
#endif

   /****
    * This functions helps to count number of CUDA blocks depending on the 
    * number of the CUDA threads and the block size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfBlocks( const int threads,
                                 const int blockSize );

   /****
    * This functions helps to count number of CUDA grids depending on the 
    * number of the CUDA blocks and maximum grid size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfGrids( const int blocks,
                                const int gridSize = getMaxGridSize() );
   
#ifdef HAVE_CUDA   
   /*! This method sets up gridSize and computes number of grids depending
    *  on total number of CUDA threads.
    */
   static void setupThreads( const dim3& blockSize,
                             dim3& blocksCount,
                             dim3& gridsCount,
                             long long int xThreads,
                             long long int yThreads = 0,
                             long long int zThreads = 0 );
   
   /*! This method sets up grid size when one iterates over more grids.
    * If gridIdx.? < gridsCount.? then the gridSize.? is set to maximum
    * allowed by CUDA. Otherwise gridSize.? is set to the size of the grid
    * in the last loop i.e. blocksCount.? % maxGridSize.?.
    */
   static void setupGrid( const dim3& blocksCount,
                          const dim3& gridsCount,
                          const dim3& gridIdx,
                          dim3& gridSize );
   
   static void printThreadsSetup( const dim3& blockSize,
                                  const dim3& blocksCount,
                                  const dim3& gridSize,
                                  const dim3& gridsCount,
                                  std::ostream& str = std::cout );
#endif   

   template< typename ObjectType >
   static ObjectType* passToDevice( const ObjectType& object );

@@ -135,6 +185,7 @@ class Cuda

#ifdef HAVE_CUDA
#define checkCudaDevice ::TNL::Devices::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() )
std::ostream& operator << ( std::ostream& str, const dim3& d );
#else
#define checkCudaDevice ::TNL::Devices::Cuda::checkDevice()
#endif
+16 −0
Original line number Diff line number Diff line
@@ -53,6 +53,22 @@ __device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gri
{
   return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
}

__device__ inline int Cuda::getGlobalThreadIdx_x( const dim3& gridIdx )
{
   return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
}

__device__ inline int Cuda::getGlobalThreadIdx_y( const dim3& gridIdx )
{
   return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
}

__device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx )
{
   return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
}

#endif


+20 −12
Original line number Diff line number Diff line
@@ -357,13 +357,13 @@ GridTraverser2DBoundaryAlongX(
   const Index beginX,
   const Index endX,
   const Index fixedY,
   const Index gridIdx,
   const dim3 gridIdx,
   const GridEntityParameters... gridEntityParameters )
{
   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
   typename GridType::CoordinatesType coordinates;

   coordinates.x() = beginX + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   coordinates.y() = fixedY;  
   
   if( coordinates.x() <= endX )
@@ -391,14 +391,14 @@ GridTraverser2DBoundaryAlongY(
   const Index beginY,
   const Index endY,
   const Index fixedX,
   const Index gridIdx,
   const dim3 gridIdx,
   const GridEntityParameters... gridEntityParameters )
{
   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
   typename GridType::CoordinatesType coordinates;

   coordinates.x() = fixedX;
   coordinates.y() = beginY + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   
   if( coordinates.y() <= endY )
   {
@@ -438,23 +438,30 @@ processEntities(
      ( GridEntity::getDimensions() == 2 || GridEntity::getDimensions() == 0 ) )
   {
      dim3 cudaBlockSize( 256 );
      const IndexType entitiesAlongX = end.x() - begin.x() + 1;
      dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
           cudaBlocksCountAlongY, cudaGridsCountAlongY;
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );
      
      /*const IndexType entitiesAlongX = end.x() - begin.x() + 1;
      const IndexType entitiesAlongY = end.y() - begin.y() - 1;
      dim3 cudaBlocksAlongX, cudaBlocksAlongY;
      cudaBlocksAlongX.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongX, cudaBlockSize.x );
      cudaBlocksAlongY.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongY, cudaBlockSize.x );
      const IndexType cudaGridsAlongX = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongX.x );
      const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x );
      const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x );*/
      
      auto& pool = CudaStreamPool::getInstance();
      Devices::Cuda::synchronizeDevice();
      
      const cudaStream_t& s1 = pool.getStream( stream );
      const cudaStream_t& s2 = pool.getStream( stream + 1 );
      for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ )
      dim3 gridIdx, cudaGridSize;
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
      {
         Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaBlocksAlongX, cudaBlockSize, 0, s1 >>>
               <<< cudaGridSize, cudaBlockSize, 0, s1 >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 begin.x(),
@@ -463,7 +470,7 @@ processEntities(
                 gridIdx,
                 gridEntityParameters... );
         GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaBlocksAlongX, cudaBlockSize, 0, s2 >>>
               <<< cudaGridSize, cudaBlockSize, 0, s2 >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 begin.x(),
@@ -474,10 +481,11 @@ processEntities(
      }
      const cudaStream_t& s3 = pool.getStream( stream + 2 );
      const cudaStream_t& s4 = pool.getStream( stream + 3 );
      for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ )
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
      {
         Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaBlocksAlongY, cudaBlockSize, 0, s3 >>>
               <<< cudaGridSize, cudaBlockSize, 0, s3 >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 begin.y() + 1,
@@ -486,7 +494,7 @@ processEntities(
                 gridIdx,
                 gridEntityParameters... );
         GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaBlocksAlongY, cudaBlockSize, 0, s4 >>>
               <<< cudaGridSize, cudaBlockSize, 0, s4 >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
                 &userDataPointer.template modifyData< Devices::Cuda >(),
                 begin.y() + 1,