Commit 2d5176fb authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Moved (most of) static methods from TNL::Devices::Cuda as free functions into separate namespace TNL::Cuda

Moved (most of) static methods from TNL::Devices::Cuda as free functions into separate namespace TNL::Cuda

The class TNL::Devices::Cuda was too bloated, breaking the Single
Responsibility Principle. It should be used only for template
specializations and other things common to all devices.

The functions in MemoryHelpers.h are deprecated, smart pointers should
be used instead.

The functions in LaunchHelpers.h are temporary, more refactoring is
needed with respect to execution policies and custom launch parameters.
parent fed5d45c
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -53,7 +53,7 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix,
                                         const int elementsPerRow,
                                         const int gridIdx )
{
   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx >= matrix->getRows() )
      return;
   int col = rowIdx - elementsPerRow / 2;
@@ -73,12 +73,12 @@ void setCudaTestMatrix( Matrix& matrix,
   typedef typename Matrix::IndexType IndexType;
   typedef typename Matrix::RealType RealType;
   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
      if( gridIdx == cudaGrids - 1 )
         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
      setCudaTestMatrixKernel< Matrix >
         <<< cudaGridSize, cudaBlockSize >>>
         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
+10 −10
Original line number Diff line number Diff line
@@ -24,7 +24,7 @@

#include <TNL/Devices/Host.h>
#include <TNL/Devices/SystemInfo.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Communicators/MpiCommunicator.h>

@@ -339,9 +339,9 @@ Benchmark::MetadataMap getHardwareMetadata()
                       + convertToString( cacheSizes.L2 ) + ", "
                       + convertToString( cacheSizes.L3 );
#ifdef HAVE_CUDA
   const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
   const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
                             convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
   const int activeGPU = Cuda::DeviceInfo::getActiveDevice();
   const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
                             convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) );
#endif
   Benchmark::MetadataMap metadata {
       { "host name", Devices::SystemInfo::getHostname() },
@@ -362,13 +362,13 @@ Benchmark::MetadataMap getHardwareMetadata()
       { "CPU max frequency (MHz)", convertToString( Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) },
       { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
#ifdef HAVE_CUDA
       { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
       { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) },
       { "GPU architecture", deviceArch },
       { "GPU CUDA cores", convertToString( Devices::CudaDeviceInfo::getCudaCores( activeGPU ) ) },
       { "GPU clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
       { "GPU global memory (GB)", convertToString( (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
       { "GPU memory clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
       { "GPU memory ECC enabled", convertToString( Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) ) },
       { "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) },
       { "GPU clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
       { "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
       { "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
       { "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) },
#endif
   };

+15 −15
Original line number Diff line number Diff line
@@ -82,9 +82,9 @@ setup( const Config::ParameterContainer& parameters,

   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
      this->cudaBoundaryConditions = Devices::Cuda::passToDevice( *this->boundaryConditionPointer );
      this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer );
      this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer );
      this->cudaBoundaryConditions = Cuda::passToDevice( *this->boundaryConditionPointer );
      this->cudaRightHandSide = Cuda::passToDevice( *this->rightHandSidePointer );
      this->cudaDifferentialOperator = Cuda::passToDevice( *this->differentialOperatorPointer );
   }
   this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
   this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
@@ -266,8 +266,8 @@ boundaryConditionsTemplatedCompact( const GridType* grid,
{
   typename GridType::CoordinatesType coordinates;

   coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;        
   coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;        

   if( coordinates.x() < end.x() &&
       coordinates.y() < end.y() )
@@ -357,8 +357,8 @@ heatEquationTemplatedCompact( const GridType* grid,
   typedef typename GridType::IndexType IndexType;
   typedef typename GridType::RealType RealType;

   coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;     
   coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;     
      
   MeshFunction& u = *_u;
   MeshFunction& fu = *_fu;
@@ -483,10 +483,10 @@ getExplicitUpdate( const RealType& time,
         CellType cell( mesh.template getData< DeviceType >() );
         dim3 cudaBlockSize( 16, 16 );
         dim3 cudaBlocks;
         cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
         cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
         const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
         const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
         cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
         cudaBlocks.y = Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
         const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x );
         const IndexType cudaYGrids = Cuda::getNumberOfGrids( cudaBlocks.y );
         
         //std::cerr << "Setting boundary conditions..." << std::endl;

@@ -762,10 +762,10 @@ template< typename Mesh,
HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator, Communicator >::
~HeatEquationBenchmarkProblem()
{
   if( this->cudaMesh ) Devices::Cuda::freeFromDevice( this->cudaMesh );
   if( this->cudaBoundaryConditions )  Devices::Cuda::freeFromDevice( this->cudaBoundaryConditions );
   if( this->cudaRightHandSide ) Devices::Cuda::freeFromDevice( this->cudaRightHandSide );
   if( this->cudaDifferentialOperator ) Devices::Cuda::freeFromDevice( this->cudaDifferentialOperator );
   if( this->cudaMesh ) Cuda::freeFromDevice( this->cudaMesh );
   if( this->cudaBoundaryConditions )  Cuda::freeFromDevice( this->cudaBoundaryConditions );
   if( this->cudaRightHandSide ) Cuda::freeFromDevice( this->cudaRightHandSide );
   if( this->cudaDifferentialOperator ) Cuda::freeFromDevice( this->cudaDifferentialOperator );
}


+1 −1
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@

#include <TNL/Meshes/Grid.h>
#include <TNL/Pointers/SharedPointer.h>
#include <TNL/CudaStreamPool.h>
#include <TNL/Cuda/StreamPool.h>

namespace TNL {

+11 −11
Original line number Diff line number Diff line
@@ -126,8 +126,8 @@ _GridTraverser2D(
   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
   typename GridType::CoordinatesType coordinates;

   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
   coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx );
   coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx );
   
   if( coordinates <= end )
   {
@@ -173,7 +173,7 @@ _GridTraverser2DBoundary(
   Index entitiesAlongX = endX - beginX + 1;
   Index entitiesAlongY = endY - beginY;
   
   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx );
   if( threadId < entitiesAlongX )
   {
      GridEntity entity( *grid, 
@@ -244,12 +244,12 @@ processEntities(
      dim3 cudaBlockSize( 256 );      
      dim3 cudaBlocksCount, cudaGridsCount;
      IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
      dim3 gridIdx, cudaGridSize;
      Devices::Cuda::synchronizeDevice();
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
      {
         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
         Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
         _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
@@ -266,11 +266,11 @@ processEntities(
   {
      dim3 cudaBlockSize( 16, 16 );
      dim3 cudaBlocksCount, cudaGridsCount;
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
                          end.x() - begin.x() + 1,
                          end.y() - begin.y() + 1 );
      
      auto& pool = CudaStreamPool::getInstance();
      auto& pool = Cuda::StreamPool::getInstance();
      const cudaStream_t& s = pool.getStream( stream );

      Devices::Cuda::synchronizeDevice();
@@ -278,8 +278,8 @@ processEntities(
      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
         {
            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
            Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
	    //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
            TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize, 0, s >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
Loading