Commit 2d5176fb authored by Jakub Klinkovský


Moved (most of the) static methods from TNL::Devices::Cuda into the separate namespace TNL::Cuda as free functions

The class TNL::Devices::Cuda was too bloated, breaking the Single
Responsibility Principle. It should serve only as a device tag for
template specializations and other functionality common to all devices.
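
At call sites the change is a mechanical rename. A minimal before/after
sketch using two functions that appear in this diff (size and blockSize
are placeholder variables):

    // before: static methods on the device class
    const int maxGrid = TNL::Devices::Cuda::getMaxGridSize();
    const int blocks  = TNL::Devices::Cuda::getNumberOfBlocks( size, blockSize );

    // after: free functions in the dedicated namespace
    const int maxGrid = TNL::Cuda::getMaxGridSize();
    const int blocks  = TNL::Cuda::getNumberOfBlocks( size, blockSize );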

The functions in MemoryHelpers.h are deprecated; smart pointers should
be used instead.
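
The HeatEquationBenchmarkProblem hunks below show the deprecated style,
and setCudaTestMatrix already uses the replacement (Pointers::DevicePointer).
A rough sketch of the two styles, with Matrix, matrix, testKernel and the
launch configuration as placeholders:

    // deprecated (MemoryHelpers.h): raw device copy that must be freed by hand
    Matrix* cudaMatrix = Cuda::passToDevice( matrix );
    testKernel<<< gridSize, blockSize >>>( cudaMatrix );
    Cuda::freeFromDevice( cudaMatrix );

    // preferred: the smart pointer owns the device copy and frees it automatically
    Pointers::DevicePointer< Matrix > matrixPointer( matrix );
    testKernel<<< gridSize, blockSize >>>( &matrixPointer.template modifyData< Devices::Cuda >() );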

The functions in LaunchHelpers.h are temporary; more refactoring is
needed with respect to execution policies and custom launch parameters.
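
For context, the launch pattern these helpers support is visible in the
GridTraverser hunk below: work is split into several kernel launches
whenever the block count exceeds the CUDA grid-size limit. A sketch with
a hypothetical kernel myKernel and thread count threadsCount:

    dim3 cudaBlockSize( 256 );
    dim3 cudaBlocksCount, cudaGridsCount;
    Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, threadsCount );
    dim3 gridIdx, cudaGridSize;
    for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) {
       // clamps the last grid so it covers only the remaining blocks
       Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
       myKernel<<< cudaGridSize, cudaBlockSize >>>( gridIdx.x );
    }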
parent fed5d45c
......@@ -53,7 +53,7 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix,
const int elementsPerRow,
const int gridIdx )
{
- const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+ const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
if( rowIdx >= matrix->getRows() )
return;
int col = rowIdx - elementsPerRow / 2;
......@@ -73,12 +73,12 @@ void setCudaTestMatrix( Matrix& matrix,
typedef typename Matrix::IndexType IndexType;
typedef typename Matrix::RealType RealType;
Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
- dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
+ dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
- const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
+ const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
if( gridIdx == cudaGrids - 1 )
- cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
+ cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
setCudaTestMatrixKernel< Matrix >
<<< cudaGridSize, cudaBlockSize >>>
( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
......
......@@ -24,7 +24,7 @@
#include <TNL/Devices/Host.h>
#include <TNL/Devices/SystemInfo.h>
- #include <TNL/Devices/CudaDeviceInfo.h>
+ #include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Communicators/MpiCommunicator.h>
......@@ -339,9 +339,9 @@ Benchmark::MetadataMap getHardwareMetadata()
+ convertToString( cacheSizes.L2 ) + ", "
+ convertToString( cacheSizes.L3 );
#ifdef HAVE_CUDA
- const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
- const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
- convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
+ const int activeGPU = Cuda::DeviceInfo::getActiveDevice();
+ const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
+ convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) );
#endif
Benchmark::MetadataMap metadata {
{ "host name", Devices::SystemInfo::getHostname() },
......@@ -362,13 +362,13 @@ Benchmark::MetadataMap getHardwareMetadata()
{ "CPU max frequency (MHz)", convertToString( Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) },
{ "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
#ifdef HAVE_CUDA
{ "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
{ "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) },
{ "GPU architecture", deviceArch },
{ "GPU CUDA cores", convertToString( Devices::CudaDeviceInfo::getCudaCores( activeGPU ) ) },
{ "GPU clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
{ "GPU global memory (GB)", convertToString( (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
{ "GPU memory clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
{ "GPU memory ECC enabled", convertToString( Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) ) },
{ "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) },
{ "GPU clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
{ "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
{ "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
{ "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) },
#endif
};
......
......@@ -82,9 +82,9 @@ setup( const Config::ParameterContainer& parameters,
if( std::is_same< DeviceType, Devices::Cuda >::value )
{
- this->cudaBoundaryConditions = Devices::Cuda::passToDevice( *this->boundaryConditionPointer );
- this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer );
- this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer );
+ this->cudaBoundaryConditions = Cuda::passToDevice( *this->boundaryConditionPointer );
+ this->cudaRightHandSide = Cuda::passToDevice( *this->rightHandSidePointer );
+ this->cudaDifferentialOperator = Cuda::passToDevice( *this->differentialOperatorPointer );
}
this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
......@@ -266,8 +266,8 @@ boundaryConditionsTemplatedCompact( const GridType* grid,
{
typename GridType::CoordinatesType coordinates;
- coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
- coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+ coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+ coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
if( coordinates.x() < end.x() &&
coordinates.y() < end.y() )
......@@ -357,8 +357,8 @@ heatEquationTemplatedCompact( const GridType* grid,
typedef typename GridType::IndexType IndexType;
typedef typename GridType::RealType RealType;
- coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
- coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
+ coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
+ coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
MeshFunction& u = *_u;
MeshFunction& fu = *_fu;
......@@ -483,10 +483,10 @@ getExplicitUpdate( const RealType& time,
CellType cell( mesh.template getData< DeviceType >() );
dim3 cudaBlockSize( 16, 16 );
dim3 cudaBlocks;
- cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
- cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
- const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
- const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
+ cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
+ cudaBlocks.y = Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
+ const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x );
+ const IndexType cudaYGrids = Cuda::getNumberOfGrids( cudaBlocks.y );
//std::cerr << "Setting boundary conditions..." << std::endl;
......@@ -762,10 +762,10 @@ template< typename Mesh,
HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator, Communicator >::
~HeatEquationBenchmarkProblem()
{
- if( this->cudaMesh ) Devices::Cuda::freeFromDevice( this->cudaMesh );
- if( this->cudaBoundaryConditions ) Devices::Cuda::freeFromDevice( this->cudaBoundaryConditions );
- if( this->cudaRightHandSide ) Devices::Cuda::freeFromDevice( this->cudaRightHandSide );
- if( this->cudaDifferentialOperator ) Devices::Cuda::freeFromDevice( this->cudaDifferentialOperator );
+ if( this->cudaMesh ) Cuda::freeFromDevice( this->cudaMesh );
+ if( this->cudaBoundaryConditions ) Cuda::freeFromDevice( this->cudaBoundaryConditions );
+ if( this->cudaRightHandSide ) Cuda::freeFromDevice( this->cudaRightHandSide );
+ if( this->cudaDifferentialOperator ) Cuda::freeFromDevice( this->cudaDifferentialOperator );
}
......
......@@ -12,7 +12,7 @@
#include <TNL/Meshes/Grid.h>
#include <TNL/Pointers/SharedPointer.h>
- #include <TNL/CudaStreamPool.h>
+ #include <TNL/Cuda/StreamPool.h>
namespace TNL {
......
......@@ -126,8 +126,8 @@ _GridTraverser2D(
typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
typename GridType::CoordinatesType coordinates;
- coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
- coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
+ coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx );
+ coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx );
if( coordinates <= end )
{
......@@ -173,7 +173,7 @@ _GridTraverser2DBoundary(
Index entitiesAlongX = endX - beginX + 1;
Index entitiesAlongY = endY - beginY;
- Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
+ Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx );
if( threadId < entitiesAlongX )
{
GridEntity entity( *grid,
......@@ -244,12 +244,12 @@ processEntities(
dim3 cudaBlockSize( 256 );
dim3 cudaBlocksCount, cudaGridsCount;
IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
- Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
+ Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
dim3 gridIdx, cudaGridSize;
Devices::Cuda::synchronizeDevice();
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
{
- Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+ Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
_GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize >>>
( &gridPointer.template getData< Devices::Cuda >(),
......@@ -266,11 +266,11 @@ processEntities(
{
dim3 cudaBlockSize( 16, 16 );
dim3 cudaBlocksCount, cudaGridsCount;
- Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
- end.x() - begin.x() + 1,
- end.y() - begin.y() + 1 );
+ Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
+ end.x() - begin.x() + 1,
+ end.y() - begin.y() + 1 );
- auto& pool = CudaStreamPool::getInstance();
+ auto& pool = Cuda::StreamPool::getInstance();
const cudaStream_t& s = pool.getStream( stream );
Devices::Cuda::synchronizeDevice();
......@@ -278,8 +278,8 @@ processEntities(
for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
{
- Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
- //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
+ Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
+ //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
<<< cudaGridSize, cudaBlockSize, 0, s >>>
( &gridPointer.template getData< Devices::Cuda >(),
......
......@@ -176,10 +176,10 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
{
#ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 );
- const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
- const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
- this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
- const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+ const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+ const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+ this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+ const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
localResidue = 0.0;
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
......@@ -187,7 +187,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u,
const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType );
const IndexType gridOffset = gridIdx * threadsPerGrid;
const IndexType currentSize = min( size - gridOffset, threadsPerGrid );
- const IndexType currentGridSize = Devices::Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x );
+ const IndexType currentGridSize = Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x );
updateUEuler<<< currentGridSize, cudaBlockSize, sharedMemory >>>( currentSize,
tau,
......
......@@ -290,10 +290,10 @@ void Merson< Problem, SolverMonitor >::computeKFunctions( DofVectorPointer& u,
{
#ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 );
- const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
- const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
- this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
- const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+ const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+ const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+ this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+ const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
this->problem->getExplicitUpdate( time, tau, u, k1 );
cudaDeviceSynchronize();
......@@ -384,10 +384,10 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con
{
#ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 );
- const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
- const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
- this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
- const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+ const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+ const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+ this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+ const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
{
......@@ -439,10 +439,10 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time,
{
#ifdef HAVE_CUDA
dim3 cudaBlockSize( 512 );
- const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
- const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks );
- this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) );
- const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x;
+ const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x );
+ const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks );
+ this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) );
+ const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x;
localResidue = 0.0;
for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ )
......
......@@ -10,7 +10,6 @@ using namespace TNL;
using namespace TNL::Containers;
using namespace TNL::Meshes;
using namespace TNL::Functions;
- using namespace TNL::Devices;
int main(int argc, char ** argv)
{
......@@ -28,9 +27,9 @@ int main(int argc, char ** argv)
time.start();
#ifdef HAVE_CUDA
- using Device=Cuda;
+ using Device=Devices::Cuda;
#else
- using Device=Host;
+ using Device=Devices::Host;
#endif
using MeshType= Grid<2, double,Device,int>;
......
......@@ -12,7 +12,9 @@
#pragma once
#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
+ #include <TNL/Exceptions/CudaSupportMissing.h>
+ #include <TNL/Cuda/CheckDevice.h>
namespace TNL {
namespace Allocators {
......
......@@ -12,7 +12,9 @@
#pragma once
#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
+ #include <TNL/Exceptions/CudaSupportMissing.h>
+ #include <TNL/Cuda/CheckDevice.h>
namespace TNL {
namespace Allocators {
......
......@@ -12,7 +12,9 @@
#pragma once
#include <TNL/Devices/Cuda.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
+ #include <TNL/Exceptions/CudaSupportMissing.h>
+ #include <TNL/Cuda/CheckDevice.h>
namespace TNL {
namespace Allocators {
......
......@@ -120,7 +120,7 @@
#include <iostream>
#include <stdio.h>
- #include <TNL/Devices/CudaCallable.h>
+ #include <TNL/Cuda/CudaCallable.h>
#include <TNL/Debugging/StackBacktrace.h>
namespace TNL {
......
......@@ -24,7 +24,7 @@
#include <unistd.h> // getpid
#ifdef HAVE_CUDA
- #include <TNL/Devices/Cuda.h>
+ #include <TNL/Cuda/CheckDevice.h>
typedef struct __attribute__((__packed__)) {
char name[MPI_MAX_PROCESSOR_NAME];
......
......@@ -14,7 +14,8 @@
#include <TNL/Assert.h>
#include <TNL/Math.h>
- #include <TNL/Devices/CudaDeviceInfo.h>
+ #include <TNL/Cuda/DeviceInfo.h>
+ #include <TNL/Cuda/SharedMemory.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
......@@ -52,7 +53,7 @@ CudaMultireductionKernel( const Result zero,
const int n,
Result* output )
{
- Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+ Result* sdata = Cuda::getSharedMemory< Result >();
// Get the thread id (tid), global thread id (gid) and gridSize.
const Index tid = threadIdx.y * blockDim.x + threadIdx.x;
......@@ -160,10 +161,10 @@ CudaMultireductionKernelLauncher( const Result zero,
// where blocksPerMultiprocessor is determined according to the number of
// available registers on the multiprocessor.
// On Tesla K40c, desGridSize = 8 * 15 = 120.
- const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice();
- const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+ const int activeDevice = Cuda::DeviceInfo::getActiveDevice();
+ const int blocksdPerMultiprocessor = Cuda::DeviceInfo::getRegistersPerMultiprocessor( activeDevice )
/ ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread );
- const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice );
+ const int desGridSizeX = blocksdPerMultiprocessor * Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice );
dim3 blockSize, gridSize;
// version A: max 16 rows of threads
......@@ -189,10 +190,10 @@ CudaMultireductionKernelLauncher( const Result zero,
while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock )
blockSize.x /= 2;
- gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
- gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y );
+ gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX );
+ gridSize.y = Cuda::getNumberOfBlocks( n, blockSize.y );
- if( gridSize.y > (unsigned) Devices::Cuda::getMaxGridSize() ) {
+ if( gridSize.y > (unsigned) Cuda::getMaxGridSize() ) {
std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." << std::endl;
throw 1;
}
......
......@@ -14,7 +14,7 @@
#include <stdlib.h>
- #include <TNL/Devices/Cuda.h>
+ #include <TNL/Cuda/CheckDevice.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
......
......@@ -14,7 +14,8 @@
#include <TNL/Assert.h>
#include <TNL/Math.h>
- #include <TNL/Devices/CudaDeviceInfo.h>
+ #include <TNL/Cuda/DeviceInfo.h>
+ #include <TNL/Cuda/SharedMemory.h>
#include <TNL/Containers/Algorithms/CudaReductionBuffer.h>
#include <TNL/Containers/Algorithms/ArrayOperations.h>
#include <TNL/Exceptions/CudaSupportMissing.h>
......@@ -52,7 +53,7 @@ CudaReductionKernel( const Result zero,
const Index size,
Result* output )
{
- Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+ Result* sdata = Cuda::getSharedMemory< Result >();
// Get the thread id (tid), global thread id (gid) and gridSize.
const Index tid = threadIdx.x;
......@@ -147,7 +148,7 @@ CudaReductionWithArgumentKernel( const Result zero,
Index* idxOutput,
const Index* idxInput = nullptr )
{
- Result* sdata = Devices::Cuda::getSharedMemory< Result >();
+ Result* sdata = Cuda::getSharedMemory< Result >();
Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] );
// Get the thread id (tid), global thread id (gid) and gridSize.
......@@ -282,11 +283,11 @@ struct CudaReductionKernelLauncher
// It seems to be better to map only one CUDA block per one multiprocessor or maybe
// just slightly more. Therefore we omit blocksdPerMultiprocessor in the following.
CudaReductionKernelLauncher( const Index size )
- : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ),
- blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice )
+ : activeDevice( Cuda::DeviceInfo::getActiveDevice() ),
+ blocksdPerMultiprocessor( Cuda::DeviceInfo::getRegistersPerMultiprocessor( activeDevice )
/ ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ),
- //desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
- desGridSize( Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+ //desGridSize( blocksdPerMultiprocessor * Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice ) ),
+ desGridSize( Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice ) ),
originalSize( size )
{
}
......@@ -402,7 +403,7 @@ struct CudaReductionKernelLauncher
#ifdef HAVE_CUDA
dim3 blockSize, gridSize;
blockSize.x = Reduction_maxThreadsPerBlock;
- gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+ gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
// when there is only one warp per blockSize.x, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
......@@ -510,7 +511,7 @@ struct CudaReductionKernelLauncher
#ifdef HAVE_CUDA
dim3 blockSize, gridSize;
blockSize.x = Reduction_maxThreadsPerBlock;
- gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
+ gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize );
// when there is only one warp per blockSize.x, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
......
......@@ -13,7 +13,7 @@
#include <iostream>
#include <TNL/Math.h>
- #include <TNL/Devices/Cuda.h>
+ #include <TNL/Cuda/SharedMemory.h>
#include <TNL/Exceptions/CudaBadAlloc.h>
#include <TNL/Containers/Array.h>
......@@ -36,8 +36,8 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
Real* output,
Real* auxArray )
{
- Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >();
- Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ];
+ Real* sharedData = TNL::Cuda::getSharedMemory< Real >();
+ Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ];
Real* warpSums = &auxData[ blockDim.x ];
const Index lastElementIdx = size - blockIdx.x * elementsInBlock;
......@@ -54,7 +54,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
sharedData[ 0 ] = zero;
while( idx < elementsInBlock && blockOffset + idx < size )
{
- sharedData[ Devices::Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
+ sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ];
idx += blockDim.x;
}
}
......@@ -62,7 +62,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
{
while( idx < elementsInBlock && blockOffset + idx < size )
{
- sharedData[ Devices::Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
+ sharedData[ Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ];
idx += blockDim.x;
}
}
......@@ -78,33 +78,33 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
if( chunkOffset < lastElementInBlock )
{
auxData[ threadIdx.x ] =
- sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ];
+ sharedData[ Cuda::getInterleaving( chunkOffset ) ];
}
int chunkPointer = 1;
while( chunkPointer < chunkSize &&
chunkOffset + chunkPointer < lastElementInBlock )
{
- sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
- reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
- sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
+ sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ] =
+ reduction( sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ],
+ sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] );
auxData[ threadIdx.x ] =
- sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
+ sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ];
chunkPointer++;
}
/***
* Perform the parallel prefix-sum inside warps.
*/
- const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize();
- const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize();
- for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) {
+ const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize();
+ const int warpIdx = threadIdx.x / Cuda::getWarpSize();
+ for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks )
auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] );
__syncwarp();
}
- if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 )
+ if( threadInWarpIdx == Cuda::getWarpSize() - 1 )
warpSums[ warpIdx ] = auxData[ threadIdx.x ];
__syncthreads();
......@@ -112,7 +112,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
* Compute prefix-sum of warp sums using one warp
*/
if( warpIdx == 0 )
- for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) {
+ for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) {
if( threadInWarpIdx >= stride )
warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - stride ] );
__syncwarp();
......@@ -136,9 +136,9 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
Real chunkShift( zero );
if( chunkIdx > 0 )
chunkShift = auxData[ chunkIdx - 1 ];
- sharedData[ Devices::Cuda::getInterleaving( idx ) ] =
- reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift );
- output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ];
+ sharedData[ Cuda::getInterleaving( idx ) ] =
+ reduction( sharedData[ Cuda::getInterleaving( idx ) ], chunkShift );
+ output[ blockOffset + idx ] = sharedData[ Cuda::getInterleaving( idx ) ];
idx += blockDim.x;
}
__syncthreads();
......@@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType,
{
if( scanType == ScanType::Exclusive )
{
- auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ],
- sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] );
+ auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ],
+ sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] );
}
else
- auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ];
+ auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ];
}
}
......@@ -245,7 +245,7 @@ struct CudaScanKernelLauncher
// compute the number of grids
const int elementsInBlock = 8 * blockSize;
const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
- const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+ const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
//std::cerr << "numberOfgrids = " << numberOfGrids << std::endl;
// allocate array for the block sums
......@@ -268,8 +268,8 @@ struct CudaScanKernelLauncher
// run the kernel
const std::size_t sharedDataSize = elementsInBlock +
- elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2;
- const std::size_t sharedMemory = ( sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real );
+ elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2;
+ const std::size_t sharedMemory = ( sharedDataSize + blockSize + Cuda::getWarpSize() ) * sizeof( Real );
cudaFirstPhaseBlockScan<<< cudaGridSize, cudaBlockSize, sharedMemory >>>
( scanType,
reduction,
......@@ -330,7 +330,7 @@ struct CudaScanKernelLauncher
// compute the number of grids
const int elementsInBlock = 8 * blockSize;
const Index numberOfBlocks = roundUpDivision( size, elementsInBlock );
- const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
+ const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() );
// loop over all grids
for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) {
......@@ -369,13 +369,13 @@ struct CudaScanKernelLauncher
*/
static int& maxGridSize()
{
- static int maxGridSize = Devices::Cuda::getMaxGridSize();
+ static int maxGridSize = Cuda::getMaxGridSize();
return maxGridSize;
}
static void resetMaxGridSize()
{
- maxGridSize() = Devices::Cuda::getMaxGridSize();
+ maxGridSize() = Cuda::getMaxGridSize();
}
static int& gridsCount()
......
......@@ -13,7 +13,7 @@
#include <type_traits>
#include <ostream>
- #include <TNL/Devices/Cuda.h>
+ #include <TNL/Cuda/CudaCallable.h>
namespace TNL {
namespace Containers {
......
......@@ -13,7 +13,7 @@
#include <type_traits>
#include <ostream>
- #include <TNL/Devices/Cuda.h>
+ #include <TNL/Cuda/CudaCallable.h>
namespace TNL {
namespace Containers {
......
......@@ -13,7 +13,7 @@
#pragma once
#include <TNL/Assert.h>
- #include <TNL/Devices/CudaCallable.h>
+ #include <TNL/Cuda/CudaCallable.h>