Commit 2d5176fb authored by Jakub Klinkovský's avatar Jakub Klinkovský

Moved (most of) static methods from TNL::Devices::Cuda as free functions into...

Moved (most of) static methods from TNL::Devices::Cuda as free functions into separate namespace TNL::Cuda

The class TNL::Devices::Cuda was too bloated, breaking the Single
Responsibility Principle. It should be used only for template
specializations and other things common to all devices.

The functions in MemoryHelpers.h are deprecated, smart pointers should
be used instead.

The functions in LaunchHelpers.h are temporary, more refactoring is
needed with respect to execution policies and custom launch parameters.
parent fed5d45c
...@@ -53,7 +53,7 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix, ...@@ -53,7 +53,7 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix,
const int elementsPerRow, const int elementsPerRow,
const int gridIdx ) const int gridIdx )
{ {
const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
if( rowIdx >= matrix->getRows() ) if( rowIdx >= matrix->getRows() )
return; return;
int col = rowIdx - elementsPerRow / 2; int col = rowIdx - elementsPerRow / 2;
...@@ -73,12 +73,12 @@ void setCudaTestMatrix( Matrix& matrix, ...@@ -73,12 +73,12 @@ void setCudaTestMatrix( Matrix& matrix,
typedef typename Matrix::IndexType IndexType; typedef typename Matrix::IndexType IndexType;
typedef typename Matrix::RealType RealType; typedef typename Matrix::RealType RealType;
Pointers::DevicePointer< Matrix > kernel_matrix( matrix ); Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
if( gridIdx == cudaGrids - 1 ) if( gridIdx == cudaGrids - 1 )
cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
setCudaTestMatrixKernel< Matrix > setCudaTestMatrixKernel< Matrix >
<<< cudaGridSize, cudaBlockSize >>> <<< cudaGridSize, cudaBlockSize >>>
( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx ); ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include <TNL/Devices/Host.h> #include <TNL/Devices/Host.h>
#include <TNL/Devices/SystemInfo.h> #include <TNL/Devices/SystemInfo.h>
#include <TNL/Devices/CudaDeviceInfo.h> #include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Config/ConfigDescription.h> #include <TNL/Config/ConfigDescription.h>
#include <TNL/Communicators/MpiCommunicator.h> #include <TNL/Communicators/MpiCommunicator.h>
...@@ -339,9 +339,9 @@ Benchmark::MetadataMap getHardwareMetadata() ...@@ -339,9 +339,9 @@ Benchmark::MetadataMap getHardwareMetadata()
+ convertToString( cacheSizes.L2 ) + ", " + convertToString( cacheSizes.L2 ) + ", "
+ convertToString( cacheSizes.L3 ); + convertToString( cacheSizes.L3 );
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice(); const int activeGPU = Cuda::DeviceInfo::getActiveDevice();
const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) ); convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) );
#endif #endif
Benchmark::MetadataMap metadata { Benchmark::MetadataMap metadata {
{ "host name", Devices::SystemInfo::getHostname() }, { "host name", Devices::SystemInfo::getHostname() },
...@@ -362,13 +362,13 @@ Benchmark::MetadataMap getHardwareMetadata() ...@@ -362,13 +362,13 @@ Benchmark::MetadataMap getHardwareMetadata()
{ "CPU max frequency (MHz)", convertToString( Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) }, { "CPU max frequency (MHz)", convertToString( Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) },
{ "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
{ "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) }, { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) },
{ "GPU architecture", deviceArch }, { "GPU architecture", deviceArch },
{ "GPU CUDA cores", convertToString( Devices::CudaDeviceInfo::getCudaCores( activeGPU ) ) }, { "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) },
{ "GPU clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 ) }, { "GPU clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
{ "GPU global memory (GB)", convertToString( (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) }, { "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
{ "GPU memory clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) }, { "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
{ "GPU memory ECC enabled", convertToString( Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) ) }, { "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) },
#endif #endif
}; };
......
...@@ -82,9 +82,9 @@ setup( const Config::ParameterContainer& parameters, ...@@ -82,9 +82,9 @@ setup( const Config::ParameterContainer& parameters,
if( std::is_same< DeviceType, Devices::Cuda >::value ) if( std::is_same< DeviceType, Devices::Cuda >::value )
{ {
this->cudaBoundaryConditions = Devices::Cuda::passToDevice( *this->boundaryConditionPointer ); this->cudaBoundaryConditions = Cuda::passToDevice( *this->boundaryConditionPointer );
this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer ); this->cudaRightHandSide = Cuda::passToDevice( *this->rightHandSidePointer );
this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer ); this->cudaDifferentialOperator = Cuda::passToDevice( *this->differentialOperatorPointer );
} }
this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer ); this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer ); this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
...@@ -266,8 +266,8 @@ boundaryConditionsTemplatedCompact( const GridType* grid, ...@@ -266,8 +266,8 @@ boundaryConditionsTemplatedCompact( const GridType* grid,
{ {
typename GridType::CoordinatesType coordinates; typename GridType::CoordinatesType coordinates;
coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
if( coordinates.x() < end.x() && if( coordinates.x() < end.x() &&
coordinates.y() < end.y() ) coordinates.y() < end.y() )
...@@ -357,8 +357,8 @@ heatEquationTemplatedCompact( const GridType* grid, ...@@ -357,8 +357,8 @@ heatEquationTemplatedCompact( const GridType* grid,
typedef typename GridType::IndexType IndexType; typedef typename GridType::IndexType IndexType;
typedef typename GridType::RealType RealType; typedef typename GridType::RealType RealType;
coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
MeshFunction& u = *_u; MeshFunction& u = *_u;
MeshFunction& fu = *_fu; MeshFunction& fu = *_fu;
...@@ -483,10 +483,10 @@ getExplicitUpdate( const RealType& time, ...@@ -483,10 +483,10 @@ getExplicitUpdate( const RealType& time,
CellType cell( mesh.template getData< DeviceType >() ); CellType cell( mesh.template getData< DeviceType >() );
dim3 cudaBlockSize( 16, 16 ); dim3 cudaBlockSize( 16, 16 );
dim3 cudaBlocks; dim3 cudaBlocks;
cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); cudaBlocks.y = Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x );
const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); const IndexType cudaYGrids = Cuda::getNumberOfGrids( cudaBlocks.y );
//std::cerr << "Setting boundary conditions..." << std::endl; //std::cerr << "Setting boundary conditions..." << std::endl;
...@@ -762,10 +762,10 @@ template< typename Mesh, ...@@ -762,10 +762,10 @@ template< typename Mesh,
HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator, Communicator >:: HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator, Communicator >::
~HeatEquationBenchmarkProblem() ~HeatEquationBenchmarkProblem()
{ {
if( this->cudaMesh ) Devices::Cuda::freeFromDevice( this->cudaMesh ); if( this->cudaMesh ) Cuda::freeFromDevice( this->cudaMesh );
if( this->cudaBoundaryConditions ) Devices::Cuda::freeFromDevice( this->cudaBoundaryConditions ); if( this->cudaBoundaryConditions ) Cuda::freeFromDevice( this->cudaBoundaryConditions );
if( this->cudaRightHandSide ) Devices::Cuda::freeFromDevice( this->cudaRightHandSide ); if( this->cudaRightHandSide ) Cuda::freeFromDevice( this->cudaRightHandSide );
if( this->cudaDifferentialOperator ) Devices::Cuda::freeFromDevice( this->cudaDifferentialOperator ); if( this->cudaDifferentialOperator ) Cuda::freeFromDevice( this->cudaDifferentialOperator );
} }
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
#include <TNL/Meshes/Grid.h> #include <TNL/Meshes/Grid.h>
#include <TNL/Pointers/SharedPointer.h> #include <TNL/Pointers/SharedPointer.h>
#include <TNL/CudaStreamPool.h> #include <TNL/Cuda/StreamPool.h>
namespace TNL { namespace TNL {
......
...@@ -126,8 +126,8 @@ _GridTraverser2D( ...@@ -126,8 +126,8 @@ _GridTraverser2D(
typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates; typename GridType::CoordinatesType coordinates;
coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx );
coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx );
if( coordinates <= end ) if( coordinates <= end )
{ {
...@@ -173,7 +173,7 @@ _GridTraverser2DBoundary( ...@@ -173,7 +173,7 @@ _GridTraverser2DBoundary(
Index entitiesAlongX = endX - beginX + 1; Index entitiesAlongX = endX - beginX + 1;
Index entitiesAlongY = endY - beginY; Index entitiesAlongY = endY - beginY;
Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx );
if( threadId < entitiesAlongX ) if( threadId < entitiesAlongX )
{ {
GridEntity entity( *grid, GridEntity entity( *grid,
...@@ -244,12 +244,12 @@ processEntities( ...@@ -244,12 +244,12 @@ processEntities(
dim3 cudaBlockSize( 256 ); dim3 cudaBlockSize( 256 );
dim3 cudaBlocksCount, cudaGridsCount; dim3 cudaBlocksCount, cudaGridsCount;
IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 ); IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
dim3 gridIdx, cudaGridSize; dim3 gridIdx, cudaGridSize;
Devices::Cuda::synchronizeDevice(); Devices::Cuda::synchronizeDevice();
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
{ {
Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
_GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize >>> <<< cudaGridSize, cudaBlockSize >>>
( &gridPointer.template getData< Devices::Cuda >(), ( &gridPointer.template getData< Devices::Cuda >(),
...@@ -266,11 +266,11 @@ processEntities( ...@@ -266,11 +266,11 @@ processEntities(
{ {
dim3 cudaBlockSize( 16, 16 ); dim3 cudaBlockSize( 16, 16 );
dim3 cudaBlocksCount, cudaGridsCount; dim3 cudaBlocksCount, cudaGridsCount;
Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
end.x() - begin.x() + 1, end.x() - begin.x() + 1,
end.y() - begin.y() + 1 ); end.y() - begin.y() + 1 );
auto& pool = CudaStreamPool::getInstance(); auto& pool = Cuda::StreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream ); const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice(); Devices::Cuda::synchronizeDevice();
...@@ -278,8 +278,8 @@ processEntities( ...@@ -278,8 +278,8 @@ processEntities(
for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
{ {
Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
//Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize, 0, s >>> <<< cudaGridSize, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(), ( &gridPointer.template getData< Devices::Cuda >(),
......
...@@ -176,10 +176,10 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u, ...@@ -176,10 +176,10 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
{ {
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 ); dim3 cudaBlockSize( 512 );
const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
localResidue = 0.0; localResidue = 0.0;
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
...@@ -187,7 +187,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u, ...@@ -187,7 +187,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
const IndexType gridOffset = gridIdx * threadsPerGrid; const IndexType gridOffset = gridIdx * threadsPerGrid;
const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
const IndexType currentGridSize = Devices::Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x ); const IndexType currentGridSize = Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x );
updateUEuler<<< currentGridSize, cudaBlockSize, sharedMemory >>>( currentSize, updateUEuler<<< currentGridSize, cudaBlockSize, sharedMemory >>>( currentSize,
tau, tau,
......
...@@ -290,10 +290,10 @@ void Merson< Problem, SolverMonitor >::computeKFunctions( DofVectorPointer& u, ...@@ -290,10 +290,10 @@ void Merson< Problem, SolverMonitor >::computeKFunctions( DofVectorPointer& u,
{ {
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 ); dim3 cudaBlockSize( 512 );
const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
this->problem->getExplicitUpdate( time, tau, u, k1 ); this->problem->getExplicitUpdate( time, tau, u, k1 );
cudaDeviceSynchronize(); cudaDeviceSynchronize();
...@@ -384,10 +384,10 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con ...@@ -384,10 +384,10 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con
{ {
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 ); dim3 cudaBlockSize( 512 );
const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{ {
...@@ -439,10 +439,10 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time, ...@@ -439,10 +439,10 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time,
{ {
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 ); dim3 cudaBlockSize( 512 );
const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
localResidue = 0.0; localResidue = 0.0;
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
......
...@@ -10,7 +10,6 @@ using namespace TNL; ...@@ -10,7 +10,6 @@ using namespace TNL;
using namespace TNL::Containers; using namespace TNL::Containers;
using namespace TNL::Meshes; using namespace TNL::Meshes;
using namespace TNL::Functions; using namespace TNL::Functions;
using namespace TNL::Devices;
int main(int argc, char ** argv) int main(int argc, char ** argv)
{ {
...@@ -28,9 +27,9 @@ int main(int argc, char ** argv) ...@@ -28,9 +27,9 @@ int main(int argc, char ** argv)
time.start(); time.start();
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
using Device=Cuda; using Device=Devices::Cuda;
#else #else
using Device=Host; using Device=Devices::Host;
#endif #endif
using MeshType= Grid<2, double,Device,int>; using MeshType= Grid<2, double,Device,int>;
......
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
#pragma once #pragma once
#include <TNL/Devices/Cuda.h> #include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Cuda/CheckDevice.h>
namespace TNL { namespace TNL {
namespace Allocators { namespace Allocators {
......
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
#pragma once #pragma once
#include <TNL/Devices/Cuda.h> #include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Cuda/CheckDevice.h>
namespace TNL { namespace TNL {
namespace Allocators { namespace Allocators {
......
...@@ -12,7 +12,9 @@ ...@@ -12,7 +12,9 @@
#pragma once #pragma once
#include <TNL/Devices/Cuda.h> #include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
#include <TNL/Cuda/CheckDevice.h>
namespace TNL { namespace TNL {
namespace Allocators { namespace Allocators {
......
...@@ -120,7 +120,7 @@ ...@@ -120,7 +120,7 @@
#include <iostream> #include <iostream>
#include <stdio.h> #include <stdio.h>
#include <TNL/Devices/CudaCallable.h> #include <TNL/Cuda/CudaCallable.h>
#include <TNL/Debugging/StackBacktrace.h> #include <TNL/Debugging/StackBacktrace.h>
namespace TNL { namespace TNL {
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
#include <unistd.h> // getpid #include <unistd.h> // getpid
#ifdef HAVE_CUDA #ifdef HAVE_CUDA
#include <TNL/Devices/Cuda.h> #include <TNL/Cuda/CheckDevice.h>
typedef struct __attribute__((__packed__)) { typedef struct __attribute__((__packed__)) {
char name[MPI_MAX_PROCESSOR_NAME]; char name[MPI_MAX_PROCESSOR_NAME];
......
...@@ -14,7 +14,8 @@ ...@@ -14,7 +14,8 @@
#include <TNL/Assert.h> #include <TNL/Assert.h>
#include <TNL/Math.h> #include <TNL/Math.h>
#include <TNL/Devices/CudaDeviceInfo.h> #include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Cuda/SharedMemory.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
#include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/CudaSupportMissing.h>
...@@ -52,7 +53,7 @@ CudaMultireductionKernel( const Result zero, ...@@ -52,7 +53,7 @@ CudaMultireductionKernel( const Result zero,
const int n, const int n,
Result* output ) Result* output )
{ {
Result* sdata = Devices::Cuda::getSharedMemory< Result >(); Result* sdata = Cuda::getSharedMemory< Result >();
// Get the thread id (tid), global thread id (gid) and gridSize. // Get the thread id (tid), global thread id (gid) and gridSize.
const Index tid = threadIdx.y * blockDim.x + threadIdx.x; const Index tid = threadIdx.y * blockDim.x + threadIdx.x;
...@@ -160,10 +161,10 @@ CudaMultireductionKernelLauncher( const Result zero, ...@@ -160,10 +161,10 @@ CudaMultireductionKernelLauncher( const Result zero,
// where blocksPerMultiprocessor is determined according to the number of // where blocksPerMultiprocessor is determined according to the number of
// available registers on the multiprocessor. // available registers on the multiprocessor.
// On Tesla K40c, desGridSize = 8 * 15 = 120. // On Tesla K40c, desGridSize = 8 * 15 = 120.
const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice(); const int activeDevice = Cuda::DeviceInfo::getActiveDevice();
const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) const int blocksdPerMultiprocessor = Cuda::DeviceInfo::getRegistersPerMultiprocessor( activeDevice )
/ ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread ); / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread );
const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); const int desGridSizeX = blocksdPerMultiprocessor * Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice );
dim3 blockSize, gridSize; dim3 blockSize, gridSize;
// version A: max 16 rows of threads // version A: max 16 rows of threads
...@@ -189,10 +190,10 @@ CudaMultireductionKernelLauncher( const Result zero, ...@@ -189,10 +190,10 @@ CudaMultireductionKernelLauncher( const Result zero,
while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock ) while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock )
blockSize.x /= 2; blockSize.x /= 2;
gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX ); gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y ); gridSize.y = Cuda::getNumberOfBlocks( n, blockSize.y );
if( gridSize.y > (unsigned) Devices::Cuda::getMaxGridSize() ) { if( gridSize.y > (unsigned) Cuda::getMaxGridSize() ) {
std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl; std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl;
throw 1; throw 1;
} }
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include <stdlib.h> #include <stdlib.h>
#include <TNL/Devices/Cuda.h> #include <TNL/Cuda/CheckDevice.h>
#include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/CudaSupportMissing.h>
......
...@@ -14,7 +14,8 @@ ...@@ -14,7 +14,8 @@
#include <TNL/Assert.h> #include <TNL/Assert.h>
#include <TNL/Math.h> #include <TNL/Math.h>
#include <TNL/Devices/CudaDeviceInfo.h>