Commit adae7be2 authored by Tomáš Oberhuber's avatar Tomáš Oberhuber
Browse files

Merge branch 'cuda-traverser-optimizations'

parents 4d7c5d66 ba75e97d
Loading
Loading
Loading
Loading
+60 −0
Original line number Diff line number Diff line
/***************************************************************************
                          HeatEquationBuildConfigTag.h  -  description
                             -------------------
    begin                : Jul 7, 2014
    copyright            : (C) 2014 by Tomas Oberhuber
    email                : tomas.oberhuber@fjfi.cvut.cz
 ***************************************************************************/

/* See Copyright Notice in tnl/Copyright */

#pragma once

#include <TNL/Solvers/BuildConfigTags.h>

namespace TNL {
namespace Solvers {
   
class HeatEquationBuildConfig
{
   public:

      // Prints the name of this build configuration tag to std::cerr.
      static void print() { std::cerr << "HeatEquationBuildConfig" << std::endl; }
};

/****
 * Turn off support for float and long double as the real type
 * (double remains available via the default tag settings).
 */
template<> struct ConfigTagReal< HeatEquationBuildConfig, float > { enum { enabled = false }; };
template<> struct ConfigTagReal< HeatEquationBuildConfig, long double > { enum { enabled = false }; };

/****
 * Turn off support for short int and long int indexing
 * (int remains available via the default tag settings).
 */
template<> struct ConfigTagIndex< HeatEquationBuildConfig, short int >{ enum { enabled = false }; };
template<> struct ConfigTagIndex< HeatEquationBuildConfig, long int >{ enum { enabled = false }; };

/****
 * Use of Grid is enabled only for the combinations of dimensions, Real,
 * Device and Index types that are themselves enabled by the tags above.
 */
template< int Dimensions, typename Real, typename Device, typename Index >
   struct ConfigTagMesh< HeatEquationBuildConfig, Meshes::Grid< Dimensions, Real, Device, Index > >
      { enum { enabled = ConfigTagDimensions< HeatEquationBuildConfig, Dimensions >::enabled  &&
                         ConfigTagReal< HeatEquationBuildConfig, Real >::enabled &&
                         ConfigTagDevice< HeatEquationBuildConfig, Device >::enabled &&
                         ConfigTagIndex< HeatEquationBuildConfig, Index >::enabled }; };

/****
 * Please, choose your preferred time discretisation here.
 * Only the explicit time discretisation is enabled by default.
 */
template<> struct ConfigTagTimeDiscretisation< HeatEquationBuildConfig, ExplicitTimeDiscretisationTag >{ enum { enabled = true }; };
template<> struct ConfigTagTimeDiscretisation< HeatEquationBuildConfig, SemiImplicitTimeDiscretisationTag >{ enum { enabled = false }; };
template<> struct ConfigTagTimeDiscretisation< HeatEquationBuildConfig, ImplicitTimeDiscretisationTag >{ enum { enabled = false }; };

/****
 * Only the Runge-Kutta-Merson solver is enabled by default:
 * the explicit Euler solver is explicitly disabled here.
 */
template<> struct ConfigTagExplicitSolver< HeatEquationBuildConfig, ExplicitEulerSolverTag >{ enum { enabled = false }; };

} // namespace Solvers
} // namespace TNL
+2 −0
Original line number Diff line number Diff line
@@ -21,12 +21,14 @@
#include <TNL/Functions/MeshFunction.h>
#include <TNL/Problems/HeatEquationProblem.h>
#include <TNL/Meshes/Grid.h>
#include "HeatEquationBuildConfigTag.h"

using namespace TNL;
using namespace TNL::Problems;

// Select the build configuration tag that controls which template
// instantiations get compiled. FastBuildConfig is currently active;
// the commented alternatives are kept for convenience.
//typedef tnlDefaultBuildMeshConfig BuildConfig;
typedef Solvers::FastBuildConfig BuildConfig;
//typedef Solvers::HeatEquationBuildConfig BuildConfig;

template< typename MeshConfig >
class heatEquationConfig
+93 −0
Original line number Diff line number Diff line
@@ -15,6 +15,93 @@
namespace TNL {
namespace Devices {


/****
 * Computes the number of CUDA blocks (blocksCount) and the number of CUDA
 * grids (gridsCount) needed to cover the requested number of threads along
 * each axis, for the given block size. Each blocksCount component is the
 * ceiling of threads/blockSize, but at least 1 so that an unused dimension
 * (thread count 0) still gets one block.
 */
void Cuda::setupThreads( const dim3& blockSize,
                         dim3& blocksCount,
                         dim3& gridsCount,
                         long long int xThreads,
                         long long int yThreads,
                         long long int zThreads )
{
   // Ceiling division: integer division plus one when there is a remainder.
   blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) );
   blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) );
   blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) );
   
   /****
    * TODO: Fix the following:
    * I do not know how to get the max grid size inside kernels :(
    * 
    * Also, querying the device properties here is very slow. */
   /*int currentDevice( 0 );
   cudaGetDevice( currentDevice );
   cudaDeviceProp properties;
   cudaGetDeviceProperties( &properties, currentDevice );
   gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 );
   gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 );
   gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 );
   */
   // NOTE(review): getMaxGridSize() is applied as the limit for all three
   // axes, while the hardware limits (cudaDeviceProp::maxGridSize[0..2])
   // may differ per axis -- verify this is intentional.
   gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
   gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
   gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
}

/****
 * Sets up the size of the grid with index gridIdx when the total number of
 * blocks (blocksCount) exceeds the maximum CUDA grid size and the launch is
 * therefore split into gridsCount grids along each axis (see setupThreads).
 *
 * Every grid except the last one along an axis is full-sized
 * (getMaxGridSize() blocks); the last grid gets the remaining blocks.
 *
 * Bug fix: when blocksCount.? is an exact multiple of getMaxGridSize(), the
 * remainder blocksCount.? % getMaxGridSize() is 0, yet the last grid is in
 * fact full-sized. The previous code assigned a zero-sized grid in that
 * case, which would launch no blocks at all for the final iteration.
 *
 * NOTE(review): getMaxGridSize() is used as the limit for all three axes;
 * querying cudaGetDeviceProperties here was deliberately avoided because it
 * is very slow, but the per-axis hardware limits (maxGridSize[0..2]) may
 * differ -- verify this is acceptable.
 */
void Cuda::setupGrid( const dim3& blocksCount,
                      const dim3& gridsCount,
                      const dim3& gridIdx,
                      dim3& gridSize )
{
   if( gridIdx.x < gridsCount.x - 1 )
      gridSize.x = getMaxGridSize();
   else
   {
      gridSize.x = blocksCount.x % getMaxGridSize();
      if( gridSize.x == 0 )  // exact multiple: the last grid is full-sized
         gridSize.x = getMaxGridSize();
   }

   if( gridIdx.y < gridsCount.y - 1 )
      gridSize.y = getMaxGridSize();
   else
   {
      gridSize.y = blocksCount.y % getMaxGridSize();
      if( gridSize.y == 0 )  // exact multiple: the last grid is full-sized
         gridSize.y = getMaxGridSize();
   }

   if( gridIdx.z < gridsCount.z - 1 )
      gridSize.z = getMaxGridSize();
   else
   {
      gridSize.z = blocksCount.z % getMaxGridSize();
      if( gridSize.z == 0 )  // exact multiple: the last grid is full-sized
         gridSize.z = getMaxGridSize();
   }
}

/****
 * Dumps the complete CUDA launch configuration (block size, number of
 * blocks, grid size and number of grids) to the given stream. Intended
 * for debugging the setup produced by setupThreads/setupGrid.
 */
void Cuda::printThreadsSetup( const dim3& blockSize,
                              const dim3& blocksCount,
                              const dim3& gridSize,
                              const dim3& gridsCount,
                              std::ostream& str )
{
   str << "Block size: " << blockSize << std::endl;
   str << " Blocks count: " << blocksCount << std::endl;
   str << " Grid size: " << gridSize << std::endl;
   str << " Grids count: " << gridsCount << std::endl;
}


bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
{   
   if( error == cudaSuccess )
@@ -397,5 +484,11 @@ bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
   return false;
}

// Streams a dim3 in the form "( x, y, z )" and returns the stream.
std::ostream& operator << ( std::ostream& str, const dim3& d )
{
   return str << "( " << d.x << ", " << d.y << ", " << d.z << " )";
}

} // namespace Devices
} // namespace TNL
+52 −1
Original line number Diff line number Diff line
@@ -49,17 +49,67 @@ class Cuda
   static inline constexpr int getGPUTransferBufferSize();

#ifdef HAVE_CUDA
   /***
    * Returns the global (cross-grid) thread index along the x axis.
    * This function is obsolete and should be replaced by the following
    * per-axis functions.
    */
   __device__ static inline int
   getGlobalThreadIdx( const int gridIdx = 0,
                       const int gridSize = getMaxGridSize() );   

   // Global thread index along x for launches split into several grids;
   // gridIdx counts the full-sized grids preceding the current one.
   __device__ static inline int
   getGlobalThreadIdx_x( const dim3& gridIdx );

   // Global thread index along y (see getGlobalThreadIdx_x).
   __device__ static inline int
   getGlobalThreadIdx_y( const dim3& gridIdx );

   // Global thread index along z (see getGlobalThreadIdx_x).
   __device__ static inline int
   getGlobalThreadIdx_z( const dim3& gridIdx );   
#endif

   /****
    * This function helps to compute the number of CUDA blocks needed for
    * the given number of CUDA threads and the block size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfBlocks( const int threads,
                                 const int blockSize );

   /****
    * This function helps to compute the number of CUDA grids needed for
    * the given number of CUDA blocks and the maximum grid size.
    * It is obsolete and it will be replaced by setupThreads.
    */
   static int getNumberOfGrids( const int blocks,
                                const int gridSize = getMaxGridSize() );
   
#ifdef HAVE_CUDA   
   /*! This method computes the number of blocks (blocksCount) and the
    *  number of grids (gridsCount) needed to cover the given total numbers
    *  of CUDA threads along each axis.
    */
   static void setupThreads( const dim3& blockSize,
                             dim3& blocksCount,
                             dim3& gridsCount,
                             long long int xThreads,
                             long long int yThreads = 0,
                             long long int zThreads = 0 );
   
   /*! This method sets up grid size when one iterates over more grids.
    * If gridIdx.? < gridsCount.? - 1 then gridSize.? is set to the maximum
    * allowed by CUDA. Otherwise gridSize.? is set to the size of the grid
    * in the last loop, i.e. the remaining blocksCount.? % maxGridSize blocks.
    */
   static void setupGrid( const dim3& blocksCount,
                          const dim3& gridsCount,
                          const dim3& gridIdx,
                          dim3& gridSize );
   
   /*! Prints the launch configuration computed by setupThreads/setupGrid
    *  to the given stream; intended for debugging.
    */
   static void printThreadsSetup( const dim3& blockSize,
                                  const dim3& blocksCount,
                                  const dim3& gridSize,
                                  const dim3& gridsCount,
                                  std::ostream& str = std::cout );
#endif   

   // NOTE(review): presumably copies the object into device memory and
   // returns a device pointer to the copy -- implementation not shown here,
   // confirm ownership/free semantics against its definition.
   template< typename ObjectType >
   static ObjectType* passToDevice( const ObjectType& object );

@@ -135,6 +185,7 @@ class Cuda

#ifdef HAVE_CUDA
// Checks the last CUDA error at the current source location.
#define checkCudaDevice ::TNL::Devices::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() )
// Streams a dim3 as "( x, y, z )"; defined alongside Cuda::checkDevice.
std::ostream& operator << ( std::ostream& str, const dim3& d );
#else
#define checkCudaDevice ::TNL::Devices::Cuda::checkDevice()
#endif
+16 −0
Original line number Diff line number Diff line
@@ -53,6 +53,22 @@ __device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gri
{
   return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
}

// Global x-coordinate of the calling thread across all grids of a split
// launch: gridIdx.x full-sized grids of getMaxGridSize() blocks precede the
// current one. NOTE(review): the intermediate products may overflow int for
// very large launches -- verify callers stay within int range.
__device__ inline int Cuda::getGlobalThreadIdx_x( const dim3& gridIdx )
{
   return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
}

// Global y-coordinate of the calling thread across all grids of a split
// launch (analogous to getGlobalThreadIdx_x). NOTE(review): possible int
// overflow for very large launches -- verify callers stay within int range.
__device__ inline int Cuda::getGlobalThreadIdx_y( const dim3& gridIdx )
{
   return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
}

// Global z-coordinate of the calling thread across all grids of a split
// launch (analogous to getGlobalThreadIdx_x). NOTE(review): possible int
// overflow for very large launches -- verify callers stay within int range.
__device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx )
{
   return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
}

#endif


Loading