diff --git a/src/Benchmarks/BLAS/spmv.h b/src/Benchmarks/BLAS/spmv.h index 9fe469509b7cb4aa4f2ca6a0c32251cf370b266c..d515d52d73d513d87b86d4b743d8b0e27b20e0ca 100644 --- a/src/Benchmarks/BLAS/spmv.h +++ b/src/Benchmarks/BLAS/spmv.h @@ -53,7 +53,7 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix, const int elementsPerRow, const int gridIdx ) { - const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= matrix->getRows() ) return; int col = rowIdx - elementsPerRow / 2; @@ -73,12 +73,12 @@ void setCudaTestMatrix( Matrix& matrix, typedef typename Matrix::IndexType IndexType; typedef typename Matrix::RealType RealType; Pointers::DevicePointer< Matrix > kernel_matrix( matrix ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); setCudaTestMatrixKernel< Matrix > <<< cudaGridSize, cudaBlockSize >>> ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx ); diff --git a/src/Benchmarks/Benchmarks.h b/src/Benchmarks/Benchmarks.h index 683a18376276c4b0dbd194329226e9a517a4af12..3822fef28eec53e47a22c566a34c651b248daab5 100644 --- a/src/Benchmarks/Benchmarks.h +++ b/src/Benchmarks/Benchmarks.h @@ -24,7 +24,7 @@ #include <TNL/Devices/Host.h> #include <TNL/Devices/SystemInfo.h> -#include <TNL/Devices/CudaDeviceInfo.h> +#include 
<TNL/Cuda/DeviceInfo.h> #include <TNL/Config/ConfigDescription.h> #include <TNL/Communicators/MpiCommunicator.h> @@ -339,9 +339,9 @@ Benchmark::MetadataMap getHardwareMetadata() + convertToString( cacheSizes.L2 ) + ", " + convertToString( cacheSizes.L3 ); #ifdef HAVE_CUDA - const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice(); - const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + - convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) ); + const int activeGPU = Cuda::DeviceInfo::getActiveDevice(); + const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." + + convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) ); #endif Benchmark::MetadataMap metadata { { "host name", Devices::SystemInfo::getHostname() }, @@ -362,13 +362,13 @@ Benchmark::MetadataMap getHardwareMetadata() { "CPU max frequency (MHz)", convertToString( Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) }, { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo }, #ifdef HAVE_CUDA - { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) }, + { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) }, { "GPU architecture", deviceArch }, - { "GPU CUDA cores", convertToString( Devices::CudaDeviceInfo::getCudaCores( activeGPU ) ) }, - { "GPU clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 ) }, - { "GPU global memory (GB)", convertToString( (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) }, - { "GPU memory clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) }, - { "GPU memory ECC enabled", convertToString( Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) ) }, + { "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) }, + { "GPU clock rate (MHz)", convertToString( (double) 
Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) }, + { "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) }, + { "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) }, + { "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) }, #endif }; diff --git a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h index 53cd0ec362a028b6ab5f30ce55cf489d4eb5c9f0..e3f47292306bbb879bf2677223f2f4b63cc7513c 100644 --- a/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h +++ b/src/Benchmarks/HeatEquation/HeatEquationBenchmarkProblem_impl.h @@ -82,9 +82,9 @@ setup( const Config::ParameterContainer& parameters, if( std::is_same< DeviceType, Devices::Cuda >::value ) { - this->cudaBoundaryConditions = Devices::Cuda::passToDevice( *this->boundaryConditionPointer ); - this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer ); - this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer ); + this->cudaBoundaryConditions = Cuda::passToDevice( *this->boundaryConditionPointer ); + this->cudaRightHandSide = Cuda::passToDevice( *this->rightHandSidePointer ); + this->cudaDifferentialOperator = Cuda::passToDevice( *this->differentialOperatorPointer ); } this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer ); this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer ); @@ -266,8 +266,8 @@ boundaryConditionsTemplatedCompact( const GridType* grid, { typename GridType::CoordinatesType coordinates; - coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + 
coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; if( coordinates.x() < end.x() && coordinates.y() < end.y() ) @@ -357,8 +357,8 @@ heatEquationTemplatedCompact( const GridType* grid, typedef typename GridType::IndexType IndexType; typedef typename GridType::RealType RealType; - coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; - coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; + coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; MeshFunction& u = *_u; MeshFunction& fu = *_fu; @@ -483,10 +483,10 @@ getExplicitUpdate( const RealType& time, CellType cell( mesh.template getData< DeviceType >() ); dim3 cudaBlockSize( 16, 16 ); dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); - const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y ); + cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + cudaBlocks.y = Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y ); + const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x ); + const IndexType cudaYGrids = Cuda::getNumberOfGrids( cudaBlocks.y ); //std::cerr << "Setting boundary conditions..." 
<< std::endl; @@ -762,10 +762,10 @@ template< typename Mesh, HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator, Communicator >:: ~HeatEquationBenchmarkProblem() { - if( this->cudaMesh ) Devices::Cuda::freeFromDevice( this->cudaMesh ); - if( this->cudaBoundaryConditions ) Devices::Cuda::freeFromDevice( this->cudaBoundaryConditions ); - if( this->cudaRightHandSide ) Devices::Cuda::freeFromDevice( this->cudaRightHandSide ); - if( this->cudaDifferentialOperator ) Devices::Cuda::freeFromDevice( this->cudaDifferentialOperator ); + if( this->cudaMesh ) Cuda::freeFromDevice( this->cudaMesh ); + if( this->cudaBoundaryConditions ) Cuda::freeFromDevice( this->cudaBoundaryConditions ); + if( this->cudaRightHandSide ) Cuda::freeFromDevice( this->cudaRightHandSide ); + if( this->cudaDifferentialOperator ) Cuda::freeFromDevice( this->cudaDifferentialOperator ); } diff --git a/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h index cdbc4922ca07eda7da0ba442340705f6646d8430..7e7e5369182ebc579ab98da55bbacef872284edf 100644 --- a/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h +++ b/src/Benchmarks/HeatEquation/Tuning/GridTraverser.h @@ -12,7 +12,7 @@ #include <TNL/Meshes/Grid.h> #include <TNL/Pointers/SharedPointer.h> -#include <TNL/CudaStreamPool.h> +#include <TNL/Cuda/StreamPool.h> namespace TNL { diff --git a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h index 816ee5e2c4724137118d221a7487f75651bbc8ac..2a77f8bb5287a75cee1b38356c96dd49dbff5250 100644 --- a/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h +++ b/src/Benchmarks/HeatEquation/Tuning/GridTraverser_impl.h @@ -126,8 +126,8 @@ _GridTraverser2D( typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = 
begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx ); if( coordinates <= end ) { @@ -173,7 +173,7 @@ _GridTraverser2DBoundary( Index entitiesAlongX = endX - beginX + 1; Index entitiesAlongY = endY - beginY; - Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx ); if( threadId < entitiesAlongX ) { GridEntity entity( *grid, @@ -244,12 +244,12 @@ processEntities( dim3 cudaBlockSize( 256 ); dim3 cudaBlocksCount, cudaGridsCount; IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); dim3 gridIdx, cudaGridSize; Devices::Cuda::synchronizeDevice(); for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> <<< cudaGridSize, cudaBlockSize >>> ( &gridPointer.template getData< Devices::Cuda >(), @@ -266,11 +266,11 @@ processEntities( { dim3 cudaBlockSize( 16, 16 ); dim3 cudaBlocksCount, cudaGridsCount; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1 ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1 ); - auto& pool = CudaStreamPool::getInstance(); + auto& pool = Cuda::StreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); @@ -278,8 +278,8 @@ processEntities( for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); + Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> <<< cudaGridSize, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), diff --git a/src/Benchmarks/ODESolvers/Euler.hpp b/src/Benchmarks/ODESolvers/Euler.hpp index 22c0130419123500981c4b343e3b53e2186779bc..ab975ed078c470f4824d18e7848033e6fed73f2c 100644 --- a/src/Benchmarks/ODESolvers/Euler.hpp +++ b/src/Benchmarks/ODESolvers/Euler.hpp @@ -176,10 +176,10 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u, { #ifdef HAVE_CUDA dim3 cudaBlockSize( 512 ); - const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); - const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); - this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); - const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; + const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); + const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks ); + this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) ); + const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x; localResidue = 0.0; for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) @@ -187,7 +187,7 @@ void Euler< Problem, SolverMonitor >::computeNewTimeLevel( DofVectorPointer& u, const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ); const IndexType gridOffset = gridIdx * threadsPerGrid; const IndexType currentSize = min( size - gridOffset, threadsPerGrid ); - const IndexType currentGridSize = Devices::Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x ); + const IndexType currentGridSize = Cuda::getNumberOfBlocks( currentSize, cudaBlockSize.x ); updateUEuler<<< currentGridSize, cudaBlockSize, sharedMemory >>>( currentSize, tau, diff --git a/src/Benchmarks/ODESolvers/Merson.hpp b/src/Benchmarks/ODESolvers/Merson.hpp index 1ea606c4c233e17b88bd6ba55ef0cfc593e91859..3c74bdf480c66025f44de201e18ad5368d050e6f 100644 
--- a/src/Benchmarks/ODESolvers/Merson.hpp +++ b/src/Benchmarks/ODESolvers/Merson.hpp @@ -290,10 +290,10 @@ void Merson< Problem, SolverMonitor >::computeKFunctions( DofVectorPointer& u, { #ifdef HAVE_CUDA dim3 cudaBlockSize( 512 ); - const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); - const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); - this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); - const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; + const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); + const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks ); + this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) ); + const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x; this->problem->getExplicitUpdate( time, tau, u, k1 ); cudaDeviceSynchronize(); @@ -384,10 +384,10 @@ typename Problem :: RealType Merson< Problem, SolverMonitor >::computeError( con { #ifdef HAVE_CUDA dim3 cudaBlockSize( 512 ); - const IndexType cudaBlocks = Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); - const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); - this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); - const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; + const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); + const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks ); + this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) ); + const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x; for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) { @@ -439,10 +439,10 @@ void Merson< Problem, SolverMonitor >::computeNewTimeLevel( const RealType time, { #ifdef HAVE_CUDA dim3 cudaBlockSize( 512 ); - const IndexType cudaBlocks = 
Devices::Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); - const IndexType cudaGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks ); - this->cudaBlockResidue.setSize( min( cudaBlocks, Devices::Cuda::getMaxGridSize() ) ); - const IndexType threadsPerGrid = Devices::Cuda::getMaxGridSize() * cudaBlockSize.x; + const IndexType cudaBlocks = Cuda::getNumberOfBlocks( size, cudaBlockSize.x ); + const IndexType cudaGrids = Cuda::getNumberOfGrids( cudaBlocks ); + this->cudaBlockResidue.setSize( min( cudaBlocks, Cuda::getMaxGridSize() ) ); + const IndexType threadsPerGrid = Cuda::getMaxGridSize() * cudaBlockSize.x; localResidue = 0.0; for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx ++ ) diff --git a/src/Examples/simple-examples/large-meshfunction-example.h b/src/Examples/simple-examples/large-meshfunction-example.h index 2f9c70b859606f6f02b3689e9cd74e6e87edd7ae..d5520b69e1af3ce54952d49c463c9b85d345903a 100644 --- a/src/Examples/simple-examples/large-meshfunction-example.h +++ b/src/Examples/simple-examples/large-meshfunction-example.h @@ -10,7 +10,6 @@ using namespace TNL; using namespace TNL::Containers; using namespace TNL::Meshes; using namespace TNL::Functions; -using namespace TNL::Devices; int main(int argc, char ** argv) { @@ -28,9 +27,9 @@ int main(int argc, char ** argv) time.start(); #ifdef HAVE_CUDA - using Device=Cuda; + using Device=Devices::Cuda; #else - using Device=Host; + using Device=Devices::Host; #endif using MeshType= Grid<2, double,Device,int>; diff --git a/src/TNL/Allocators/Cuda.h b/src/TNL/Allocators/Cuda.h index 74ebb840432136d9033a17a86684607098a80d86..1b648f1ce3818978e086f26e64128536f40a8806 100644 --- a/src/TNL/Allocators/Cuda.h +++ b/src/TNL/Allocators/Cuda.h @@ -12,7 +12,9 @@ #pragma once -#include <TNL/Devices/Cuda.h> +#include <TNL/Exceptions/CudaBadAlloc.h> +#include <TNL/Exceptions/CudaSupportMissing.h> +#include <TNL/Cuda/CheckDevice.h> namespace TNL { namespace Allocators { diff --git a/src/TNL/Allocators/CudaHost.h 
b/src/TNL/Allocators/CudaHost.h index 284c91fe9b8dbc7abe8e3d4685ef1d7551d19a89..9047e0b9af632b9f6fd466352d2cd3659f67210a 100644 --- a/src/TNL/Allocators/CudaHost.h +++ b/src/TNL/Allocators/CudaHost.h @@ -12,7 +12,9 @@ #pragma once -#include <TNL/Devices/Cuda.h> +#include <TNL/Exceptions/CudaBadAlloc.h> +#include <TNL/Exceptions/CudaSupportMissing.h> +#include <TNL/Cuda/CheckDevice.h> namespace TNL { namespace Allocators { diff --git a/src/TNL/Allocators/CudaManaged.h b/src/TNL/Allocators/CudaManaged.h index db29f86cb618bf79e4f1c0fa0ac1ad2750d476bc..bb878ca66bef97491c6db407128c7c3322fdce7a 100644 --- a/src/TNL/Allocators/CudaManaged.h +++ b/src/TNL/Allocators/CudaManaged.h @@ -12,7 +12,9 @@ #pragma once -#include <TNL/Devices/Cuda.h> +#include <TNL/Exceptions/CudaBadAlloc.h> +#include <TNL/Exceptions/CudaSupportMissing.h> +#include <TNL/Cuda/CheckDevice.h> namespace TNL { namespace Allocators { diff --git a/src/TNL/Assert.h b/src/TNL/Assert.h index 3d91c8c763b8c9532713f90beee3a02fd2c64b6a..df862956219ac03e1d1e1fa27e1c67a0e1035ad5 100644 --- a/src/TNL/Assert.h +++ b/src/TNL/Assert.h @@ -120,7 +120,7 @@ #include <iostream> #include <stdio.h> -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Debugging/StackBacktrace.h> namespace TNL { diff --git a/src/TNL/Communicators/MpiCommunicator.h b/src/TNL/Communicators/MpiCommunicator.h index 926fa329a6e88b3d84406f464aa3e82057e3ef24..0aa14a9ece5d518ce5e142898cea56593d375189 100644 --- a/src/TNL/Communicators/MpiCommunicator.h +++ b/src/TNL/Communicators/MpiCommunicator.h @@ -24,7 +24,7 @@ #include <unistd.h> // getpid #ifdef HAVE_CUDA - #include <TNL/Devices/Cuda.h> + #include <TNL/Cuda/CheckDevice.h> typedef struct __attribute__((__packed__)) { char name[MPI_MAX_PROCESSOR_NAME]; diff --git a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h index 
e67c11b419dca3ae83706d43c56f5f282aba4beb..c97e0a8aa8f12354b9dbe84e2858fafe235881b2 100644 --- a/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaMultireductionKernel.h @@ -14,7 +14,8 @@ #include <TNL/Assert.h> #include <TNL/Math.h> -#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Cuda/DeviceInfo.h> +#include <TNL/Cuda/SharedMemory.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Exceptions/CudaSupportMissing.h> @@ -52,7 +53,7 @@ CudaMultireductionKernel( const Result zero, const int n, Result* output ) { - Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + Result* sdata = Cuda::getSharedMemory< Result >(); // Get the thread id (tid), global thread id (gid) and gridSize. const Index tid = threadIdx.y * blockDim.x + threadIdx.x; @@ -160,10 +161,10 @@ CudaMultireductionKernelLauncher( const Result zero, // where blocksPerMultiprocessor is determined according to the number of // available registers on the multiprocessor. // On Tesla K40c, desGridSize = 8 * 15 = 120. 
- const int activeDevice = Devices::CudaDeviceInfo::getActiveDevice(); - const int blocksdPerMultiprocessor = Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) + const int activeDevice = Cuda::DeviceInfo::getActiveDevice(); + const int blocksdPerMultiprocessor = Cuda::DeviceInfo::getRegistersPerMultiprocessor( activeDevice ) / ( Multireduction_maxThreadsPerBlock * Multireduction_registersPerThread ); - const int desGridSizeX = blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ); + const int desGridSizeX = blocksdPerMultiprocessor * Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice ); dim3 blockSize, gridSize; // version A: max 16 rows of threads @@ -189,10 +190,10 @@ CudaMultireductionKernelLauncher( const Result zero, while( blockSize.x * blockSize.y > Multireduction_maxThreadsPerBlock ) blockSize.x /= 2; - gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX ); - gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y ); + gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSizeX ); + gridSize.y = Cuda::getNumberOfBlocks( n, blockSize.y ); - if( gridSize.y > (unsigned) Devices::Cuda::getMaxGridSize() ) { + if( gridSize.y > (unsigned) Cuda::getMaxGridSize() ) { std::cerr << "Maximum gridSize.y limit exceeded (limit is 65535, attempted " << gridSize.y << ")." 
<< std::endl; throw 1; } diff --git a/src/TNL/Containers/Algorithms/CudaReductionBuffer.h b/src/TNL/Containers/Algorithms/CudaReductionBuffer.h index 2897c7280a6bc61f9b60a9cb3c7b44a94ad20de3..f873d7815157d9703364622f5d814a2a516af6bf 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionBuffer.h +++ b/src/TNL/Containers/Algorithms/CudaReductionBuffer.h @@ -14,7 +14,7 @@ #include <stdlib.h> -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CheckDevice.h> #include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Exceptions/CudaSupportMissing.h> diff --git a/src/TNL/Containers/Algorithms/CudaReductionKernel.h b/src/TNL/Containers/Algorithms/CudaReductionKernel.h index 36bd5c88b5794c18f1bb9688cee9690c115e50d4..3e948a90673069cae11c336a7c5e75e9e13d5807 100644 --- a/src/TNL/Containers/Algorithms/CudaReductionKernel.h +++ b/src/TNL/Containers/Algorithms/CudaReductionKernel.h @@ -14,7 +14,8 @@ #include <TNL/Assert.h> #include <TNL/Math.h> -#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Cuda/DeviceInfo.h> +#include <TNL/Cuda/SharedMemory.h> #include <TNL/Containers/Algorithms/CudaReductionBuffer.h> #include <TNL/Containers/Algorithms/ArrayOperations.h> #include <TNL/Exceptions/CudaSupportMissing.h> @@ -52,7 +53,7 @@ CudaReductionKernel( const Result zero, const Index size, Result* output ) { - Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + Result* sdata = Cuda::getSharedMemory< Result >(); // Get the thread id (tid), global thread id (gid) and gridSize. const Index tid = threadIdx.x; @@ -147,7 +148,7 @@ CudaReductionWithArgumentKernel( const Result zero, Index* idxOutput, const Index* idxInput = nullptr ) { - Result* sdata = Devices::Cuda::getSharedMemory< Result >(); + Result* sdata = Cuda::getSharedMemory< Result >(); Index* sidx = reinterpret_cast< Index* >( &sdata[ blockDim.x ] ); // Get the thread id (tid), global thread id (gid) and gridSize. 
@@ -282,11 +283,11 @@ struct CudaReductionKernelLauncher // It seems to be better to map only one CUDA block per one multiprocessor or maybe // just slightly more. Therefore we omit blocksdPerMultiprocessor in the following. CudaReductionKernelLauncher( const Index size ) - : activeDevice( Devices::CudaDeviceInfo::getActiveDevice() ), - blocksdPerMultiprocessor( Devices::CudaDeviceInfo::getRegistersPerMultiprocessor( activeDevice ) + : activeDevice( Cuda::DeviceInfo::getActiveDevice() ), + blocksdPerMultiprocessor( Cuda::DeviceInfo::getRegistersPerMultiprocessor( activeDevice ) / ( Reduction_maxThreadsPerBlock * Reduction_registersPerThread ) ), - //desGridSize( blocksdPerMultiprocessor * Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ), - desGridSize( Devices::CudaDeviceInfo::getCudaMultiprocessors( activeDevice ) ), + //desGridSize( blocksdPerMultiprocessor * Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice ) ), + desGridSize( Cuda::DeviceInfo::getCudaMultiprocessors( activeDevice ) ), originalSize( size ) { } @@ -402,7 +403,7 @@ struct CudaReductionKernelLauncher #ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; - gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); + gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of bounds @@ -510,7 +511,7 @@ struct CudaReductionKernelLauncher #ifdef HAVE_CUDA dim3 blockSize, gridSize; blockSize.x = Reduction_maxThreadsPerBlock; - gridSize.x = TNL::min( Devices::Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); + gridSize.x = TNL::min( Cuda::getNumberOfBlocks( size, blockSize.x ), desGridSize ); // when there is only one warp per blockSize.x, we need to allocate two warps // worth of shared memory so that we don't index shared memory out of 
bounds diff --git a/src/TNL/Containers/Algorithms/CudaScanKernel.h b/src/TNL/Containers/Algorithms/CudaScanKernel.h index a8c3548757668df966b094a9da19e37b88ab7ed8..5b20164390b53d38034cc5b4b278eff417c2142d 100644 --- a/src/TNL/Containers/Algorithms/CudaScanKernel.h +++ b/src/TNL/Containers/Algorithms/CudaScanKernel.h @@ -13,7 +13,7 @@ #include <iostream> #include <TNL/Math.h> -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/SharedMemory.h> #include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Containers/Array.h> @@ -36,8 +36,8 @@ cudaFirstPhaseBlockScan( const ScanType scanType, Real* output, Real* auxArray ) { - Real* sharedData = TNL::Devices::Cuda::getSharedMemory< Real >(); - Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2 ]; + Real* sharedData = TNL::Cuda::getSharedMemory< Real >(); + Real* auxData = &sharedData[ elementsInBlock + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2 ]; Real* warpSums = &auxData[ blockDim.x ]; const Index lastElementIdx = size - blockIdx.x * elementsInBlock; @@ -54,7 +54,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType, sharedData[ 0 ] = zero; while( idx < elementsInBlock && blockOffset + idx < size ) { - sharedData[ Devices::Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ]; + sharedData[ Cuda::getInterleaving( idx + 1 ) ] = input[ blockOffset + idx ]; idx += blockDim.x; } } @@ -62,7 +62,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType, { while( idx < elementsInBlock && blockOffset + idx < size ) { - sharedData[ Devices::Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ]; + sharedData[ Cuda::getInterleaving( idx ) ] = input[ blockOffset + idx ]; idx += blockDim.x; } } @@ -78,33 +78,33 @@ cudaFirstPhaseBlockScan( const ScanType scanType, if( chunkOffset < lastElementInBlock ) { auxData[ threadIdx.x ] = - sharedData[ Devices::Cuda::getInterleaving( chunkOffset ) ]; + sharedData[ Cuda::getInterleaving( chunkOffset ) ]; } 
int chunkPointer = 1; while( chunkPointer < chunkSize && chunkOffset + chunkPointer < lastElementInBlock ) { - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = - reduction( sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ], - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); + sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ] = + reduction( sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ], + sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer - 1 ) ] ); auxData[ threadIdx.x ] = - sharedData[ Devices::Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; + sharedData[ Cuda::getInterleaving( chunkOffset + chunkPointer ) ]; chunkPointer++; } /*** * Perform the parallel prefix-sum inside warps. */ - const int threadInWarpIdx = threadIdx.x % Devices::Cuda::getWarpSize(); - const int warpIdx = threadIdx.x / Devices::Cuda::getWarpSize(); - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { + const int threadInWarpIdx = threadIdx.x % Cuda::getWarpSize(); + const int warpIdx = threadIdx.x / Cuda::getWarpSize(); + for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride && threadIdx.x < numberOfChunks ) auxData[ threadIdx.x ] = reduction( auxData[ threadIdx.x ], auxData[ threadIdx.x - stride ] ); __syncwarp(); } - if( threadInWarpIdx == Devices::Cuda::getWarpSize() - 1 ) + if( threadInWarpIdx == Cuda::getWarpSize() - 1 ) warpSums[ warpIdx ] = auxData[ threadIdx.x ]; __syncthreads(); @@ -112,7 +112,7 @@ cudaFirstPhaseBlockScan( const ScanType scanType, * Compute prefix-sum of warp sums using one warp */ if( warpIdx == 0 ) - for( int stride = 1; stride < Devices::Cuda::getWarpSize(); stride *= 2 ) { + for( int stride = 1; stride < Cuda::getWarpSize(); stride *= 2 ) { if( threadInWarpIdx >= stride ) warpSums[ threadIdx.x ] = reduction( warpSums[ threadIdx.x ], warpSums[ threadIdx.x - 
stride ] ); __syncwarp(); @@ -136,9 +136,9 @@ cudaFirstPhaseBlockScan( const ScanType scanType, Real chunkShift( zero ); if( chunkIdx > 0 ) chunkShift = auxData[ chunkIdx - 1 ]; - sharedData[ Devices::Cuda::getInterleaving( idx ) ] = - reduction( sharedData[ Devices::Cuda::getInterleaving( idx ) ], chunkShift ); - output[ blockOffset + idx ] = sharedData[ Devices::Cuda::getInterleaving( idx ) ]; + sharedData[ Cuda::getInterleaving( idx ) ] = + reduction( sharedData[ Cuda::getInterleaving( idx ) ], chunkShift ); + output[ blockOffset + idx ] = sharedData[ Cuda::getInterleaving( idx ) ]; idx += blockDim.x; } __syncthreads(); @@ -147,11 +147,11 @@ cudaFirstPhaseBlockScan( const ScanType scanType, { if( scanType == ScanType::Exclusive ) { - auxArray[ blockIdx.x ] = reduction( sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ], - sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock ) ] ); + auxArray[ blockIdx.x ] = reduction( sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ], + sharedData[ Cuda::getInterleaving( lastElementInBlock ) ] ); } else - auxArray[ blockIdx.x ] = sharedData[ Devices::Cuda::getInterleaving( lastElementInBlock - 1 ) ]; + auxArray[ blockIdx.x ] = sharedData[ Cuda::getInterleaving( lastElementInBlock - 1 ) ]; } } @@ -245,7 +245,7 @@ struct CudaScanKernelLauncher // compute the number of grids const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); + const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); //std::cerr << "numberOfgrids = " << numberOfGrids << std::endl; // allocate array for the block sums @@ -268,8 +268,8 @@ struct CudaScanKernelLauncher // run the kernel const std::size_t sharedDataSize = elementsInBlock + - elementsInBlock / Devices::Cuda::getNumberOfSharedMemoryBanks() + 2; - const std::size_t sharedMemory = ( 
sharedDataSize + blockSize + Devices::Cuda::getWarpSize() ) * sizeof( Real ); + elementsInBlock / Cuda::getNumberOfSharedMemoryBanks() + 2; + const std::size_t sharedMemory = ( sharedDataSize + blockSize + Cuda::getWarpSize() ) * sizeof( Real ); cudaFirstPhaseBlockScan<<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( scanType, reduction, @@ -330,7 +330,7 @@ struct CudaScanKernelLauncher // compute the number of grids const int elementsInBlock = 8 * blockSize; const Index numberOfBlocks = roundUpDivision( size, elementsInBlock ); - const Index numberOfGrids = Devices::Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); + const Index numberOfGrids = Cuda::getNumberOfGrids( numberOfBlocks, maxGridSize() ); // loop over all grids for( Index gridIdx = 0; gridIdx < numberOfGrids; gridIdx++ ) { @@ -369,13 +369,13 @@ struct CudaScanKernelLauncher */ static int& maxGridSize() { - static int maxGridSize = Devices::Cuda::getMaxGridSize(); + static int maxGridSize = Cuda::getMaxGridSize(); return maxGridSize; } static void resetMaxGridSize() { - maxGridSize() = Devices::Cuda::getMaxGridSize(); + maxGridSize() = Cuda::getMaxGridSize(); } static int& gridsCount() diff --git a/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h b/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h index fe7a0fb380230909be094042f69cf3ddabd24522..9be47980d1dbef78af8891ff50837d70fb851c22 100644 --- a/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h +++ b/src/TNL/Containers/Multimaps/EllpackIndexMultimapValues.h @@ -13,7 +13,7 @@ #include <type_traits> #include <ostream> -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CudaCallable.h> namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h b/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h index 95ffade9fcad3674a7092bc69701e2a2500ab819..efae4f05173b9f0531cc12e96dd5644a5c72fefe 100644 --- 
a/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h +++ b/src/TNL/Containers/Multimaps/StaticEllpackIndexMultimapValues.h @@ -13,7 +13,7 @@ #include <type_traits> #include <ostream> -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CudaCallable.h> namespace TNL { namespace Containers { diff --git a/src/TNL/Containers/ndarray/SizesHolder.h b/src/TNL/Containers/ndarray/SizesHolder.h index c3334e19b2c4e1ff5db6706fc25e264106fea691..72d61bf8123349a7d2807d4dd877b01f109d48a1 100644 --- a/src/TNL/Containers/ndarray/SizesHolder.h +++ b/src/TNL/Containers/ndarray/SizesHolder.h @@ -13,7 +13,7 @@ #pragma once #include <TNL/Assert.h> -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/TemplateStaticFor.h> #include <TNL/Containers/ndarray/Meta.h> diff --git a/src/TNL/Cuda/CheckDevice.h b/src/TNL/Cuda/CheckDevice.h new file mode 100644 index 0000000000000000000000000000000000000000..c857d8dd6ab8129fd2b1cac4e967831207296153 --- /dev/null +++ b/src/TNL/Cuda/CheckDevice.h @@ -0,0 +1,40 @@ +/*************************************************************************** + CheckDevice.h - description + ------------------- + begin : Aug 18, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Exceptions/CudaRuntimeError.h> + +namespace TNL { +namespace Cuda { + +#ifdef HAVE_CUDA + /**** + * I do not know why, but it is more reliable to pass the error code instead + * of calling cudaGetLastError() inside the function. + * We recommend using the macro 'TNL_CHECK_CUDA_DEVICE' defined below.
+ */ + inline void checkDevice( const char* file_name, int line, cudaError error ) + { + if( error != cudaSuccess ) + throw Exceptions::CudaRuntimeError( error, file_name, line ); + } +#else + inline void checkDevice() {} +#endif + +} // namespace Cuda +} // namespace TNL + +#ifdef HAVE_CUDA +#define TNL_CHECK_CUDA_DEVICE ::TNL::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() ) +#else +#define TNL_CHECK_CUDA_DEVICE ::TNL::Cuda::checkDevice() +#endif diff --git a/src/TNL/Devices/CudaCallable.h b/src/TNL/Cuda/CudaCallable.h similarity index 88% rename from src/TNL/Devices/CudaCallable.h rename to src/TNL/Cuda/CudaCallable.h index f63e4e430e01e76df7833001a651e4f220e9bab6..5cd3e8fbbe51abe0bd7dc525c165990b734ef388 100644 --- a/src/TNL/Devices/CudaCallable.h +++ b/src/TNL/Cuda/CudaCallable.h @@ -12,9 +12,6 @@ // The __cuda_callable__ macro has to be in a separate header file to avoid // infinite loops by the #include directives. -// -// For example, the implementation of Devices::Cuda needs TNL_ASSERT_* -// macros, which need __cuda_callable__ functions. 
/*** * This macro serves for definition of function which are supposed to be called diff --git a/src/TNL/Cuda/DeviceInfo.h b/src/TNL/Cuda/DeviceInfo.h new file mode 100644 index 0000000000000000000000000000000000000000..d53b46fecbf45c49d7c9d6723423c6951a456fef --- /dev/null +++ b/src/TNL/Cuda/DeviceInfo.h @@ -0,0 +1,52 @@ +/*************************************************************************** + CudaDeviceInfo.h - description + ------------------- + begin : Jun 21, 2015 + copyright : (C) 2007 by Tomas Oberhuber + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/String.h> + +namespace TNL { +namespace Cuda { + +struct DeviceInfo +{ + static int getNumberOfDevices(); + + static int getActiveDevice(); + + static String getDeviceName( int deviceNum ); + + static int getArchitectureMajor( int deviceNum ); + + static int getArchitectureMinor( int deviceNum ); + + static int getClockRate( int deviceNum ); + + static std::size_t getGlobalMemory( int deviceNum ); + + static std::size_t getFreeGlobalMemory(); + + static int getMemoryClockRate( int deviceNum ); + + static bool getECCEnabled( int deviceNum ); + + static int getCudaMultiprocessors( int deviceNum ); + + static int getCudaCoresPerMultiprocessors( int deviceNum ); + + static int getCudaCores( int deviceNum ); + + static int getRegistersPerMultiprocessor( int deviceNum ); +}; + +} // namespace Cuda +} // namespace TNL + +#include <TNL/Cuda/DeviceInfo.hpp> diff --git a/src/TNL/Devices/CudaDeviceInfo_impl.h b/src/TNL/Cuda/DeviceInfo.hpp similarity index 86% rename from src/TNL/Devices/CudaDeviceInfo_impl.h rename to src/TNL/Cuda/DeviceInfo.hpp index f29ecd8c91493edb538f11a8ced6c9ee6503983a..d10e6f05cbbb391d06e1287ad6f19db30c967ac0 100644 --- a/src/TNL/Devices/CudaDeviceInfo_impl.h +++ b/src/TNL/Cuda/DeviceInfo.hpp @@ -1,5 +1,5 @@ 
/*************************************************************************** - CudaDeviceInfo_impl.h - description + DeviceInfo.hpp - description ------------------- begin : Jun 21, 2015 copyright : (C) 2007 by Tomas Oberhuber @@ -12,14 +12,14 @@ #include <unordered_map> -#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Cuda/DeviceInfo.h> #include <TNL/Exceptions/CudaSupportMissing.h> namespace TNL { -namespace Devices { +namespace Cuda { inline int -CudaDeviceInfo:: +DeviceInfo:: getNumberOfDevices() { #ifdef HAVE_CUDA @@ -32,7 +32,7 @@ getNumberOfDevices() } inline int -CudaDeviceInfo:: +DeviceInfo:: getActiveDevice() { #ifdef HAVE_CUDA @@ -45,7 +45,7 @@ getActiveDevice() } inline String -CudaDeviceInfo:: +DeviceInfo:: getDeviceName( int deviceNum ) { #ifdef HAVE_CUDA @@ -58,7 +58,7 @@ getDeviceName( int deviceNum ) } inline int -CudaDeviceInfo:: +DeviceInfo:: getArchitectureMajor( int deviceNum ) { #ifdef HAVE_CUDA @@ -71,7 +71,7 @@ getArchitectureMajor( int deviceNum ) } inline int -CudaDeviceInfo:: +DeviceInfo:: getArchitectureMinor( int deviceNum ) { #ifdef HAVE_CUDA @@ -84,7 +84,7 @@ getArchitectureMinor( int deviceNum ) } inline int -CudaDeviceInfo:: +DeviceInfo:: getClockRate( int deviceNum ) { #ifdef HAVE_CUDA @@ -96,8 +96,8 @@ getClockRate( int deviceNum ) #endif } -inline size_t -CudaDeviceInfo:: +inline std::size_t +DeviceInfo:: getGlobalMemory( int deviceNum ) { #ifdef HAVE_CUDA @@ -109,13 +109,13 @@ getGlobalMemory( int deviceNum ) #endif } -inline size_t -CudaDeviceInfo:: +inline std::size_t +DeviceInfo:: getFreeGlobalMemory() { #ifdef HAVE_CUDA - size_t free = 0; - size_t total = 0; + std::size_t free = 0; + std::size_t total = 0; cudaMemGetInfo( &free, &total ); return free; #else @@ -124,7 +124,7 @@ getFreeGlobalMemory() } inline int -CudaDeviceInfo:: +DeviceInfo:: getMemoryClockRate( int deviceNum ) { #ifdef HAVE_CUDA @@ -137,7 +137,7 @@ getMemoryClockRate( int deviceNum ) } inline bool -CudaDeviceInfo:: +DeviceInfo:: getECCEnabled( int 
deviceNum ) { #ifdef HAVE_CUDA @@ -150,7 +150,7 @@ getECCEnabled( int deviceNum ) } inline int -CudaDeviceInfo:: +DeviceInfo:: getCudaMultiprocessors( int deviceNum ) { #ifdef HAVE_CUDA @@ -169,12 +169,12 @@ getCudaMultiprocessors( int deviceNum ) } inline int -CudaDeviceInfo:: +DeviceInfo:: getCudaCoresPerMultiprocessors( int deviceNum ) { #ifdef HAVE_CUDA - int major = CudaDeviceInfo::getArchitectureMajor( deviceNum ); - int minor = CudaDeviceInfo::getArchitectureMinor( deviceNum ); + int major = DeviceInfo::getArchitectureMajor( deviceNum ); + int minor = DeviceInfo::getArchitectureMinor( deviceNum ); switch( major ) { case 1: // Tesla generation, G80, G8x, G9x classes @@ -209,19 +209,19 @@ getCudaCoresPerMultiprocessors( int deviceNum ) } inline int -CudaDeviceInfo:: +DeviceInfo:: getCudaCores( int deviceNum ) { #ifdef HAVE_CUDA - return CudaDeviceInfo::getCudaMultiprocessors( deviceNum ) * - CudaDeviceInfo::getCudaCoresPerMultiprocessors( deviceNum ); + return DeviceInfo::getCudaMultiprocessors( deviceNum ) * + DeviceInfo::getCudaCoresPerMultiprocessors( deviceNum ); #else throw Exceptions::CudaSupportMissing(); #endif } inline int -CudaDeviceInfo:: +DeviceInfo:: getRegistersPerMultiprocessor( int deviceNum ) { #ifdef HAVE_CUDA @@ -239,5 +239,5 @@ getRegistersPerMultiprocessor( int deviceNum ) #endif } -} // namespace Devices +} // namespace Cuda } // namespace TNL diff --git a/src/TNL/Cuda/LaunchHelpers.h b/src/TNL/Cuda/LaunchHelpers.h new file mode 100644 index 0000000000000000000000000000000000000000..aaca4a67d8f228e593915b491bb771c8c89921c2 --- /dev/null +++ b/src/TNL/Cuda/LaunchHelpers.h @@ -0,0 +1,162 @@ +/*************************************************************************** + LaunchHelpers.h - description + ------------------- + begin : Aug 19, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. 
+ email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <TNL/Math.h> + +namespace TNL { +namespace Cuda { + +inline constexpr int getMaxGridSize() +{ + return 65535; +} + +inline constexpr int getMaxBlockSize() +{ + return 1024; +} + +inline constexpr int getWarpSize() +{ + return 32; +} + +#ifdef HAVE_CUDA +__device__ inline int getGlobalThreadIdx( const int gridIdx = 0, + const int gridSize = getMaxGridSize() ) +{ + return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x; +} + +__device__ inline int getGlobalThreadIdx_x( const dim3& gridIdx ) +{ + return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; +} + +__device__ inline int getGlobalThreadIdx_y( const dim3& gridIdx ) +{ + return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; +} + +__device__ inline int getGlobalThreadIdx_z( const dim3& gridIdx ) +{ + return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; +} +#endif + +inline int getNumberOfBlocks( const int threads, + const int blockSize ) +{ + return roundUpDivision( threads, blockSize ); +} + +inline int getNumberOfGrids( const int blocks, + const int gridSize = getMaxGridSize() ) +{ + return roundUpDivision( blocks, gridSize ); +} + +#ifdef HAVE_CUDA +inline void setupThreads( const dim3& blockSize, + dim3& blocksCount, + dim3& gridsCount, + long long int xThreads, + long long int yThreads = 0, + long long int zThreads = 0 ) +{ + blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) ); + blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) ); + blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) ); + + /**** + * TODO: Fix the following: + * I do not know how to get max grid size in kernels :( + * + * Also, this is very slow.
*/ + /*int currentDevice( 0 ); + cudaGetDevice( currentDevice ); + cudaDeviceProp properties; + cudaGetDeviceProperties( &properties, currentDevice ); + gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 ); + gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 ); + gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 ); + */ + gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 ); + gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 ); + gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 ); +} + +inline void setupGrid( const dim3& blocksCount, + const dim3& gridsCount, + const dim3& gridIdx, + dim3& gridSize ) +{ + /* TODO: this is extremely slow!!!! + int currentDevice( 0 ); + cudaGetDevice( &currentDevice ); + cudaDeviceProp properties; + cudaGetDeviceProperties( &properties, currentDevice );*/ + + /**** + * TODO: fix the following + if( gridIdx.x < gridsCount.x ) + gridSize.x = properties.maxGridSize[ 0 ]; + else + gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ]; + + if( gridIdx.y < gridsCount.y ) + gridSize.y = properties.maxGridSize[ 1 ]; + else + gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ]; + + if( gridIdx.z < gridsCount.z ) + gridSize.z = properties.maxGridSize[ 2 ]; + else + gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/ + + if( gridIdx.x < gridsCount.x - 1 ) + gridSize.x = getMaxGridSize(); + else + gridSize.x = blocksCount.x % getMaxGridSize(); + + if( gridIdx.y < gridsCount.y - 1 ) + gridSize.y = getMaxGridSize(); + else + gridSize.y = blocksCount.y % getMaxGridSize(); + + if( gridIdx.z < gridsCount.z - 1 ) + gridSize.z = getMaxGridSize(); + else + gridSize.z = blocksCount.z % getMaxGridSize(); +} + +inline std::ostream& operator<<(
std::ostream& str, const dim3& d ) +{ + str << "( " << d.x << ", " << d.y << ", " << d.z << " )"; + return str; +} + +inline void printThreadsSetup( const dim3& blockSize, + const dim3& blocksCount, + const dim3& gridSize, + const dim3& gridsCount, + std::ostream& str = std::cout ) +{ + str << "Block size: " << blockSize << std::endl + << " Blocks count: " << blocksCount << std::endl + << " Grid size: " << gridSize << std::endl + << " Grids count: " << gridsCount << std::endl; +} +#endif + +} // namespace Cuda +} // namespace TNL diff --git a/src/TNL/Cuda/MemoryHelpers.h b/src/TNL/Cuda/MemoryHelpers.h new file mode 100644 index 0000000000000000000000000000000000000000..cb214f5d02ebaf6784d08e4c288de6ddd8638de7 --- /dev/null +++ b/src/TNL/Cuda/MemoryHelpers.h @@ -0,0 +1,103 @@ +/*************************************************************************** + MemoryHelpers.h - description + ------------------- + begin : Aug 19, 2019 + copyright : (C) 2019 by Tomas Oberhuber et al. + email : tomas.oberhuber@fjfi.cvut.cz + ***************************************************************************/ + +/* See Copyright Notice in tnl/Copyright */ + +#pragma once + +#include <iostream> + +#include <TNL/Cuda/CheckDevice.h> + +#include <TNL/Exceptions/CudaSupportMissing.h> +#include <TNL/Exceptions/CudaBadAlloc.h> + +namespace TNL { +namespace Cuda { + +template< typename ObjectType > +[[deprecated("Allocators and MemoryOperations should be used instead.")]] +ObjectType* passToDevice( const ObjectType& object ) +{ +#ifdef HAVE_CUDA + ObjectType* deviceObject; + if( cudaMalloc( ( void** ) &deviceObject, + ( size_t ) sizeof( ObjectType ) ) != cudaSuccess ) + throw Exceptions::CudaBadAlloc(); + if( cudaMemcpy( ( void* ) deviceObject, + ( void* ) &object, + sizeof( ObjectType ), + cudaMemcpyHostToDevice ) != cudaSuccess ) + { + TNL_CHECK_CUDA_DEVICE; + cudaFree( ( void* ) deviceObject ); + TNL_CHECK_CUDA_DEVICE; + return 0; + } + return deviceObject; +#else + throw
Exceptions::CudaSupportMissing(); +#endif +} + +template< typename ObjectType > +[[deprecated("Allocators and MemoryOperations should be used instead.")]] +ObjectType passFromDevice( const ObjectType* object ) +{ +#ifdef HAVE_CUDA + ObjectType aux; + cudaMemcpy( ( void* ) aux, + ( void* ) &object, + sizeof( ObjectType ), + cudaMemcpyDeviceToHost ); + TNL_CHECK_CUDA_DEVICE; + return aux; +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< typename ObjectType > +[[deprecated("Allocators and MemoryOperations should be used instead.")]] +void passFromDevice( const ObjectType* deviceObject, + ObjectType& hostObject ) +{ +#ifdef HAVE_CUDA + cudaMemcpy( ( void* ) &hostObject, + ( void* ) deviceObject, + sizeof( ObjectType ), + cudaMemcpyDeviceToHost ); + TNL_CHECK_CUDA_DEVICE; +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< typename ObjectType > +[[deprecated("Allocators and MemoryOperations should be used instead.")]] +void freeFromDevice( ObjectType* deviceObject ) +{ +#ifdef HAVE_CUDA + cudaFree( ( void* ) deviceObject ); + TNL_CHECK_CUDA_DEVICE; +#else + throw Exceptions::CudaSupportMissing(); +#endif +} + +template< typename ObjectType > +void print( const ObjectType* deviceObject, std::ostream& str = std::cout ) +{ +#ifdef HAVE_CUDA + ObjectType hostObject; + passFromDevice( deviceObject, hostObject ); + str << hostObject; +#endif +} + +} // namespace Cuda +} // namespace TNL diff --git a/src/TNL/CudaSharedMemory.h b/src/TNL/Cuda/SharedMemory.h similarity index 78% rename from src/TNL/CudaSharedMemory.h rename to src/TNL/Cuda/SharedMemory.h index ec9a43c207fc7962f36ffb40ad0af71973b76868..29851952c01c86356e6872511f90496358b55152 100644 --- a/src/TNL/CudaSharedMemory.h +++ b/src/TNL/Cuda/SharedMemory.h @@ -1,5 +1,5 @@ /*************************************************************************** - CudaSharedMemory.h - description + SharedMemory.h - description ------------------- begin : Oct 18, 2017 copyright : (C) 2017 by
Tomas Oberhuber et al. @@ -26,11 +26,11 @@ * * Until CUDA 8.0, it was possible to use reinterpret_cast this way: * - * template< typename Element, size_t Alignment > - * __device__ Element* Cuda::getSharedMemory() + * template< typename T, size_t Alignment > + * __device__ T* getSharedMemory() * { * extern __shared__ __align__ ( Alignment ) unsigned char __sdata[]; - * return reinterpret_cast< Element* >( __sdata ); + * return reinterpret_cast< T* >( __sdata ); * } * * But since CUDA 9.0 there is a new restriction that the alignment of the @@ -44,12 +44,13 @@ #include <stdint.h> namespace TNL { +namespace Cuda { template< typename T, std::size_t _alignment = CHAR_BIT * sizeof(T) > -struct CudaSharedMemory {}; +struct SharedMemory; template< typename T > -struct CudaSharedMemory< T, 8 > +struct SharedMemory< T, 8 > { __device__ inline operator T* () { @@ -65,7 +66,7 @@ struct CudaSharedMemory< T, 8 > }; template< typename T > -struct CudaSharedMemory< T, 16 > +struct SharedMemory< T, 16 > { __device__ inline operator T* () { @@ -81,7 +82,7 @@ struct CudaSharedMemory< T, 16 > }; template< typename T > -struct CudaSharedMemory< T, 32 > +struct SharedMemory< T, 32 > { __device__ inline operator T* () { @@ -97,7 +98,7 @@ struct CudaSharedMemory< T, 32 > }; template< typename T > -struct CudaSharedMemory< T, 64 > +struct SharedMemory< T, 64 > { __device__ inline operator T* () { @@ -112,6 +113,25 @@ struct CudaSharedMemory< T, 64 > } }; +template< typename T > +__device__ inline T* getSharedMemory() +{ + return SharedMemory< T >{}; +} + +// helper functions for indexing shared memory +inline constexpr int getNumberOfSharedMemoryBanks() +{ + return 32; +} + +template< typename Index > +__device__ Index getInterleaving( const Index index ) +{ + return index + index / Cuda::getNumberOfSharedMemoryBanks(); +} + +} // namespace Cuda } // namespace TNL #endif diff --git a/src/TNL/CudaStreamPool.h b/src/TNL/Cuda/StreamPool.h similarity index 73% rename from 
src/TNL/CudaStreamPool.h rename to src/TNL/Cuda/StreamPool.h index 1dd2b7907fe39b53e331b0147fff1cabe16424ef..59bf38a5791d2ae9f381e831559c5caa7788567a 100644 --- a/src/TNL/CudaStreamPool.h +++ b/src/TNL/Cuda/StreamPool.h @@ -1,5 +1,5 @@ /*************************************************************************** - CudaStreamPool.h - description + StreamPool.h - description ------------------- begin : Oct 14, 2016 copyright : (C) 2016 by Tomas Oberhuber et al. @@ -15,22 +15,20 @@ #include <stdlib.h> #include <unordered_map> -#include <TNL/Devices/Host.h> -#include <TNL/Devices/Cuda.h> - namespace TNL { +namespace Cuda { #ifdef HAVE_CUDA -class CudaStreamPool +class StreamPool { public: // stop the compiler generating methods of copy the object - CudaStreamPool( CudaStreamPool const& copy ) = delete; - CudaStreamPool& operator=( CudaStreamPool const& copy ) = delete; + StreamPool( StreamPool const& copy ) = delete; + StreamPool& operator=( StreamPool const& copy ) = delete; - inline static CudaStreamPool& getInstance() + inline static StreamPool& getInstance() { - static CudaStreamPool instance; + static StreamPool instance; return instance; } @@ -47,14 +45,14 @@ class CudaStreamPool private: // private constructor of the singleton - inline CudaStreamPool() + inline StreamPool() { - atexit( CudaStreamPool::free_atexit ); + atexit( StreamPool::free_atexit ); } inline static void free_atexit( void ) { - CudaStreamPool::getInstance().free(); + StreamPool::getInstance().free(); } protected: @@ -70,5 +68,6 @@ class CudaStreamPool }; #endif +} // namespace Cuda } // namespace TNL diff --git a/src/TNL/Devices/Cuda.h b/src/TNL/Devices/Cuda.h index 122c466c9f98778d3cbaa19a4853710284761495..853cd2e038c669485781a1462cdb4a54a9670e35 100644 --- a/src/TNL/Devices/Cuda.h +++ b/src/TNL/Devices/Cuda.h @@ -16,7 +16,7 @@ #include <TNL/Assert.h> #include <TNL/Pointers/SmartPointersRegister.h> #include <TNL/Timer.h> -#include <TNL/Devices/CudaCallable.h> +#include 
<TNL/Cuda/CudaCallable.h> #include <TNL/Config/ConfigDescription.h> #include <TNL/Config/ParameterContainer.h> @@ -31,131 +31,8 @@ public: static inline bool setup( const Config::ParameterContainer& parameters, const String& prefix = "" ); - __cuda_callable__ static inline constexpr int getMaxGridSize(); - - __cuda_callable__ static inline constexpr int getMaxBlockSize(); - - __cuda_callable__ static inline constexpr int getWarpSize(); - - __cuda_callable__ static inline constexpr int getNumberOfSharedMemoryBanks(); - static inline constexpr int getGPUTransferBufferSize(); -#ifdef HAVE_CUDA - /*** - * This function is obsolete and should be replaced by the following functions. - */ - __device__ static inline int - getGlobalThreadIdx( const int gridIdx = 0, - const int gridSize = getMaxGridSize() ); - - __device__ static inline int - getGlobalThreadIdx_x( const dim3& gridIdx ); - - __device__ static inline int - getGlobalThreadIdx_y( const dim3& gridIdx ); - - __device__ static inline int - getGlobalThreadIdx_z( const dim3& gridIdx ); -#endif - - /**** - * This functions helps to count number of CUDA blocks depending on the - * number of the CUDA threads and the block size. - * It is obsolete and it will be replaced by setupThreads. - */ - static inline int getNumberOfBlocks( const int threads, - const int blockSize ); - - /**** - * This functions helps to count number of CUDA grids depending on the - * number of the CUDA blocks and maximum grid size. - * It is obsolete and it will be replaced by setupThreads. - */ - static inline int getNumberOfGrids( const int blocks, - const int gridSize = getMaxGridSize() ); - -#ifdef HAVE_CUDA - /*! This method sets up gridSize and computes number of grids depending - * on total number of CUDA threads. - */ - static void setupThreads( const dim3& blockSize, - dim3& blocksCount, - dim3& gridsCount, - long long int xThreads, - long long int yThreads = 0, - long long int zThreads = 0 ); - - /*! 
This method sets up grid size when one iterates over more grids. - * If gridIdx.? < gridsCount.? then the gridSize.? is set to maximum - * allowed by CUDA. Otherwise gridSize.? is set to the size of the grid - * in the last loop i.e. blocksCount.? % maxGridSize.?. - */ - static void setupGrid( const dim3& blocksCount, - const dim3& gridsCount, - const dim3& gridIdx, - dim3& gridSize ); - - static void printThreadsSetup( const dim3& blockSize, - const dim3& blocksCount, - const dim3& gridSize, - const dim3& gridsCount, - std::ostream& str = std::cout ); -#endif - - template< typename ObjectType > - static ObjectType* passToDevice( const ObjectType& object ); - - template< typename ObjectType > - static ObjectType passFromDevice( const ObjectType* object ); - - template< typename ObjectType > - static void passFromDevice( const ObjectType* deviceObject, - ObjectType& hostObject ); - - template< typename ObjectType > - static void freeFromDevice( ObjectType* object ); - - template< typename ObjectType > - static void print( const ObjectType* object, std::ostream& str = std::cout ); - -#ifdef HAVE_CUDA - template< typename Index > - static __device__ Index getInterleaving( const Index index ); - - /**** - * Declaration of variables for dynamic shared memory is difficult in - * templated functions. For example, the following does not work for - * different types T: - * - * template< typename T > - * void foo() - * { - * extern __shared__ T shx[]; - * } - * - * This is because extern variables must be declared exactly once. In - * templated functions we need to have same variable name with different - * type, which causes the conflict. In CUDA samples they solve the problem - * using template specialization via classes, but using one base type and - * reinterpret_cast works too. - * See http://stackoverflow.com/a/19339004/4180822 for reference. 
- */ - template< typename Element > - static __device__ Element* getSharedMemory(); -#endif - -#ifdef HAVE_CUDA - /**** - * I do not know why, but it is more reliable to pass the error code instead - * of calling cudaGetLastError() inside the method. - * We recommend to use macro 'TNL_CHECK_CUDA_DEVICE' defined bellow. - */ - static inline void checkDevice( const char* file_name, int line, cudaError error ); -#else - static inline void checkDevice() {} -#endif - static inline void insertSmartPointer( Pointers::SmartPointer* pointer ); static inline void removeSmartPointer( Pointers::SmartPointer* pointer ); @@ -180,18 +57,6 @@ public: static inline Pointers::SmartPointersRegister& getSmartPointersRegister(); }; -#ifdef HAVE_CUDA -#define TNL_CHECK_CUDA_DEVICE ::TNL::Devices::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() ) -#else -#define TNL_CHECK_CUDA_DEVICE ::TNL::Devices::Cuda::checkDevice() -#endif - -#ifdef HAVE_CUDA -namespace { - std::ostream& operator << ( std::ostream& str, const dim3& d ); -} -#endif - #ifdef HAVE_CUDA #if __CUDA_ARCH__ < 600 namespace { diff --git a/src/TNL/Devices/CudaDeviceInfo.h b/src/TNL/Devices/CudaDeviceInfo.h deleted file mode 100644 index 9eefe3bad8932670af271204e03f72b5eb501a95..0000000000000000000000000000000000000000 --- a/src/TNL/Devices/CudaDeviceInfo.h +++ /dev/null @@ -1,56 +0,0 @@ -/*************************************************************************** - CudaDeviceInfo.h - description - ------------------- - begin : Jun 21, 2015 - copyright : (C) 2007 by Tomas Oberhuber - email : tomas.oberhuber@fjfi.cvut.cz - ***************************************************************************/ - -/* See Copyright Notice in tnl/Copyright */ - -#pragma once - -#include <stdlib.h> - -#include <TNL/String.h> - -namespace TNL { -namespace Devices { - -class CudaDeviceInfo -{ - public: - - static int getNumberOfDevices(); - - static int getActiveDevice(); - - static String getDeviceName( int deviceNum ); - - static int 
getArchitectureMajor( int deviceNum ); - - static int getArchitectureMinor( int deviceNum ); - - static int getClockRate( int deviceNum ); - - static size_t getGlobalMemory( int deviceNum ); - - static size_t getFreeGlobalMemory(); - - static int getMemoryClockRate( int deviceNum ); - - static bool getECCEnabled( int deviceNum ); - - static int getCudaMultiprocessors( int deviceNum ); - - static int getCudaCoresPerMultiprocessors( int deviceNum ); - - static int getCudaCores( int deviceNum ); - - static int getRegistersPerMultiprocessor( int deviceNum ); -}; - -} // namespace Devices -} // namespace TNL - -#include <TNL/Devices/CudaDeviceInfo_impl.h> diff --git a/src/TNL/Devices/Cuda_impl.h b/src/TNL/Devices/Cuda_impl.h index b758584bd55773de11b0f85f187bf9e64fc3befc..6d3daa356cb64f2373451a260a1403aabff83be6 100644 --- a/src/TNL/Devices/Cuda_impl.h +++ b/src/TNL/Devices/Cuda_impl.h @@ -12,11 +12,10 @@ #include <TNL/Math.h> #include <TNL/Devices/Cuda.h> -#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Cuda/DeviceInfo.h> #include <TNL/Exceptions/CudaBadAlloc.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/CudaRuntimeError.h> -#include <TNL/CudaSharedMemory.h> namespace TNL { namespace Devices { @@ -49,279 +48,30 @@ Cuda::setup( const Config::ParameterContainer& parameters, return true; } -__cuda_callable__ -inline constexpr int Cuda::getMaxGridSize() -{ - return 65535; -} - -__cuda_callable__ -inline constexpr int Cuda::getMaxBlockSize() -{ - return 1024; -} - -__cuda_callable__ -inline constexpr int Cuda::getWarpSize() -{ - return 32; -} - -__cuda_callable__ -inline constexpr int Cuda::getNumberOfSharedMemoryBanks() -{ - return 32; -} - inline constexpr int Cuda::getGPUTransferBufferSize() { return 1 << 20; } -#ifdef HAVE_CUDA -__device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gridSize ) -{ - return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x; -} - -__device__ inline int 
Cuda::getGlobalThreadIdx_x( const dim3& gridIdx ) -{ - return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; -} - -__device__ inline int Cuda::getGlobalThreadIdx_y( const dim3& gridIdx ) -{ - return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y; -} - -__device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx ) -{ - return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z; -} -#endif - -inline int Cuda::getNumberOfBlocks( const int threads, - const int blockSize ) -{ - return roundUpDivision( threads, blockSize ); -} - -inline int Cuda::getNumberOfGrids( const int blocks, - const int gridSize ) -{ - return roundUpDivision( blocks, gridSize ); -} - -#ifdef HAVE_CUDA -inline void Cuda::setupThreads( const dim3& blockSize, - dim3& blocksCount, - dim3& gridsCount, - long long int xThreads, - long long int yThreads, - long long int zThreads ) -{ - blocksCount.x = max( 1, xThreads / blockSize.x + ( xThreads % blockSize.x != 0 ) ); - blocksCount.y = max( 1, yThreads / blockSize.y + ( yThreads % blockSize.y != 0 ) ); - blocksCount.z = max( 1, zThreads / blockSize.z + ( zThreads % blockSize.z != 0 ) ); - - /**** - * TODO: Fix the following: - * I do not known how to get max grid size in kernels :( - * - * Also, this is very slow. 
*/ - /*int currentDevice( 0 ); - cudaGetDevice( currentDevice ); - cudaDeviceProp properties; - cudaGetDeviceProperties( &properties, currentDevice ); - gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 ); - gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 ); - gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 ); - */ - gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 ); - gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 ); - gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 ); -} - -inline void Cuda::setupGrid( const dim3& blocksCount, - const dim3& gridsCount, - const dim3& gridIdx, - dim3& gridSize ) -{ - /* TODO: this is extremely slow!!!! - int currentDevice( 0 ); - cudaGetDevice( ¤tDevice ); - cudaDeviceProp properties; - cudaGetDeviceProperties( &properties, currentDevice );*/ - - /**** - * TODO: fix the following - if( gridIdx.x < gridsCount.x ) - gridSize.x = properties.maxGridSize[ 0 ]; - else - gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ]; - - if( gridIdx.y < gridsCount.y ) - gridSize.y = properties.maxGridSize[ 1 ]; - else - gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ]; - - if( gridIdx.z < gridsCount.z ) - gridSize.z = properties.maxGridSize[ 2 ]; - else - gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/ - - if( gridIdx.x < gridsCount.x - 1 ) - gridSize.x = getMaxGridSize(); - else - gridSize.x = blocksCount.x % getMaxGridSize(); - - if( gridIdx.y < gridsCount.y - 1 ) - gridSize.y = getMaxGridSize(); - else - gridSize.y = blocksCount.y % getMaxGridSize(); - - if( gridIdx.z < gridsCount.z - 1 ) - gridSize.z = getMaxGridSize(); - else - gridSize.z = blocksCount.z % getMaxGridSize(); -} - -inline void 
Cuda::printThreadsSetup( const dim3& blockSize, - const dim3& blocksCount, - const dim3& gridSize, - const dim3& gridsCount, - std::ostream& str ) -{ - str << "Block size: " << blockSize << std::endl - << " Blocks count: " << blocksCount << std::endl - << " Grid size: " << gridSize << std::endl - << " Grids count: " << gridsCount << std::endl; -} -#endif - - -template< typename ObjectType > -ObjectType* Cuda::passToDevice( const ObjectType& object ) -{ -#ifdef HAVE_CUDA - ObjectType* deviceObject; - if( cudaMalloc( ( void** ) &deviceObject, - ( size_t ) sizeof( ObjectType ) ) != cudaSuccess ) - throw Exceptions::CudaBadAlloc(); - if( cudaMemcpy( ( void* ) deviceObject, - ( void* ) &object, - sizeof( ObjectType ), - cudaMemcpyHostToDevice ) != cudaSuccess ) - { - TNL_CHECK_CUDA_DEVICE; - cudaFree( ( void* ) deviceObject ); - TNL_CHECK_CUDA_DEVICE; - return 0; - } - return deviceObject; -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -template< typename ObjectType > -ObjectType Cuda::passFromDevice( const ObjectType* object ) -{ -#ifdef HAVE_CUDA - ObjectType aux; - cudaMemcpy( ( void* ) aux, - ( void* ) &object, - sizeof( ObjectType ), - cudaMemcpyDeviceToHost ); - TNL_CHECK_CUDA_DEVICE; - return aux; -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -template< typename ObjectType > -void Cuda::passFromDevice( const ObjectType* deviceObject, - ObjectType& hostObject ) -{ -#ifdef HAVE_CUDA - cudaMemcpy( ( void* ) &hostObject, - ( void* ) deviceObject, - sizeof( ObjectType ), - cudaMemcpyDeviceToHost ); - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -template< typename ObjectType > -void Cuda::print( const ObjectType* deviceObject, std::ostream& str ) -{ -#ifdef HAVE_CUDA - ObjectType hostObject; - passFromDevice( deviceObject, hostObject ); - str << hostObject; -#endif -} - - -template< typename ObjectType > -void Cuda::freeFromDevice( ObjectType* deviceObject ) -{ -#ifdef HAVE_CUDA - cudaFree( ( void* 
) deviceObject ); - TNL_CHECK_CUDA_DEVICE; -#else - throw Exceptions::CudaSupportMissing(); -#endif -} - -#ifdef HAVE_CUDA -template< typename Index > -__device__ Index Cuda::getInterleaving( const Index index ) -{ - return index + index / Cuda::getNumberOfSharedMemoryBanks(); -} - -template< typename Element > -__device__ Element* Cuda::getSharedMemory() -{ - return CudaSharedMemory< Element >(); -} -#endif - -#ifdef HAVE_CUDA -inline void Cuda::checkDevice( const char* file_name, int line, cudaError error ) -{ - if( error != cudaSuccess ) - throw Exceptions::CudaRuntimeError( error, file_name, line ); -} -#endif - inline void Cuda::insertSmartPointer( Pointers::SmartPointer* pointer ) { - getSmartPointersRegister().insert( pointer, Devices::CudaDeviceInfo::getActiveDevice() ); + getSmartPointersRegister().insert( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() ); } inline void Cuda::removeSmartPointer( Pointers::SmartPointer* pointer ) { - getSmartPointersRegister().remove( pointer, Devices::CudaDeviceInfo::getActiveDevice() ); + getSmartPointersRegister().remove( pointer, TNL::Cuda::DeviceInfo::getActiveDevice() ); } inline bool Cuda::synchronizeDevice( int deviceId ) { #ifdef HAVE_CUDA -#ifdef HAVE_CUDA_UNIFIED_MEMORY - return true; -#else if( deviceId < 0 ) - deviceId = Devices::CudaDeviceInfo::getActiveDevice(); + deviceId = TNL::Cuda::DeviceInfo::getActiveDevice(); getSmartPointersSynchronizationTimer().start(); bool b = getSmartPointersRegister().synchronizeDevice( deviceId ); getSmartPointersSynchronizationTimer().stop(); return b; -#endif #else return true; #endif @@ -339,16 +89,6 @@ inline Pointers::SmartPointersRegister& Cuda::getSmartPointersRegister() return reg; } -#ifdef HAVE_CUDA -namespace { - std::ostream& operator << ( std::ostream& str, const dim3& d ) - { - str << "( " << d.x << ", " << d.y << ", " << d.z << " )"; - return str; - } -} -#endif - // double-precision atomicAdd function for Maxwell and older GPUs // copied from: 
https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions #ifdef HAVE_CUDA diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h index 55129c4e1008e69ef3b3d238acb8fc587cce4076..4dac64a23fc50d85186e2a1f118f818807d0fc87 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase1D_impl.h @@ -22,7 +22,7 @@ initInterface( const MeshFunctionPointer& _input, const MeshType& mesh = _input->getMesh(); const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); dim3 blockSize( cudaBlockSize ); dim3 gridSize( numBlocksX ); Devices::Cuda::synchronizeDevice(); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h index cddf4f9cb7a97f8a74eb94f7377b2cf740db03a5..947a4be062a9b8763ec9a6042c41d3048aa8a361 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase2D_impl.h @@ -25,8 +25,8 @@ initInterface( const MeshFunctionPointer& _input, const MeshType& mesh = _input->getMesh(); const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = 
Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); Devices::Cuda::synchronizeDevice(); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h index 32548abcfe66affd71a79d3ed5f2f21d67df644e..eb0665c7e4a66a8424af4a312d68f0f3a039a934 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlDirectEikonalMethodBase3D_impl.h @@ -23,9 +23,9 @@ initInterface( const MeshFunctionPointer& _input, const MeshType& mesh = _input->getMesh(); const int cudaBlockSize( 8 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); + int numBlocksX = Cuda::getNumberOfBlocks( mesh.getDimensions().x(), cudaBlockSize ); + int numBlocksY = Cuda::getNumberOfBlocks( mesh.getDimensions().y(), cudaBlockSize ); + int numBlocksZ = Cuda::getNumberOfBlocks( mesh.getDimensions().z(), cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" 
<< std::endl; dim3 blockSize( cudaBlockSize, cudaBlockSize, cudaBlockSize ); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h index f2f033ccbee3ffa5b71567ea1b54e2307ebe1713..52c2ebbee3f16fd39a2dfab738ac9bea6ffaf393 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod1D_impl.h @@ -105,7 +105,7 @@ solve( const MeshPointer& mesh, // TODO: CUDA code #ifdef HAVE_CUDA const int cudaBlockSize( 16 ); - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); + int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x(), cudaBlockSize ); dim3 blockSize( cudaBlockSize ); dim3 gridSize( numBlocksX ); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h index e5638c11dd71d88d72d8d0590c8e51c0df6baaab..c5a0f74cc53f89808f5308aa628c6c55802eb200 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod2D_impl.h @@ -251,8 +251,8 @@ solve( const MeshPointer& mesh, const int cudaBlockSize( 16 ); // Setting number of threads and blocks for kernel - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); + int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); + int numBlocksY = 
Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); dim3 blockSize( cudaBlockSize, cudaBlockSize ); dim3 gridSize( numBlocksX, numBlocksY ); diff --git a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h index 325b626f7bf5262637f8e1b43ec9e156bbeca26b..3fce5564efdf65ebaf45c62eaed8cae259d1c4b3 100644 --- a/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h +++ b/src/TNL/Experimental/Hamilton-Jacobi/Solvers/hamilton-jacobi/tnlFastSweepingMethod3D_impl.h @@ -263,9 +263,9 @@ solve( const MeshPointer& mesh, const int cudaBlockSize( 8 ); // Getting the number of blocks in grid in each direction (without overlaps bcs we dont calculate on overlaps) - int numBlocksX = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); - int numBlocksY = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); - int numBlocksZ = Devices::Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); + int numBlocksX = Cuda::getNumberOfBlocks( mesh->getDimensions().x() - vecLowerOverlaps[0] - vecUpperOverlaps[0], cudaBlockSize ); + int numBlocksY = Cuda::getNumberOfBlocks( mesh->getDimensions().y() - vecLowerOverlaps[1] - vecUpperOverlaps[1], cudaBlockSize ); + int numBlocksZ = Cuda::getNumberOfBlocks( mesh->getDimensions().z() - vecLowerOverlaps[2] - vecUpperOverlaps[2], cudaBlockSize ); if( cudaBlockSize * cudaBlockSize * cudaBlockSize > 1024 || numBlocksX > 1024 || numBlocksY > 1024 || numBlocksZ > 64 ) std::cout << "Invalid kernel call. Dimensions of grid are max: [1024,1024,64], and maximum threads per block are 1024!" 
<< std::endl; diff --git a/src/TNL/File.hpp b/src/TNL/File.hpp index 19a9eaa06a3a88e264de3b4528730baa0304a897..a3eb66066e2a014fe16c70910db60212aecc8629 100644 --- a/src/TNL/File.hpp +++ b/src/TNL/File.hpp @@ -17,6 +17,7 @@ #include <TNL/File.h> #include <TNL/Assert.h> +#include <TNL/Cuda/CheckDevice.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Exceptions/FileSerializationError.h> #include <TNL/Exceptions/FileDeserializationError.h> diff --git a/src/TNL/Functions/FunctionAdapter.h b/src/TNL/Functions/FunctionAdapter.h index b9c35886689bf254eccfd4154469be71462107d3..b763ee47631d10522038b8dad9f927e83bc37a88 100644 --- a/src/TNL/Functions/FunctionAdapter.h +++ b/src/TNL/Functions/FunctionAdapter.h @@ -10,7 +10,7 @@ #pragma once -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Config/ParameterContainer.h> #include <TNL/Functions/Domain.h> diff --git a/src/TNL/Functions/OperatorFunction.h b/src/TNL/Functions/OperatorFunction.h index 1f1e89b029f5d5e816e6f6df4bc6d9e9d27bb377..cc46d557a10cbfc315d04275d613de0ce679dce0 100644 --- a/src/TNL/Functions/OperatorFunction.h +++ b/src/TNL/Functions/OperatorFunction.h @@ -11,7 +11,7 @@ #pragma once #include <type_traits> -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Functions/MeshFunction.h> #include <TNL/Solvers/PDE/BoundaryConditionsSetter.h> diff --git a/src/TNL/Functions/TestFunction_impl.h b/src/TNL/Functions/TestFunction_impl.h index e2bdce1f1c4a72848e82f10d9c270099121c28b7..918f24107d0e4a27a24d414d6cdcbea4f885cb45 100644 --- a/src/TNL/Functions/TestFunction_impl.h +++ b/src/TNL/Functions/TestFunction_impl.h @@ -11,6 +11,8 @@ #pragma once #include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/MemoryHelpers.h> + #include <TNL/Functions/Analytic/Constant.h> #include <TNL/Functions/Analytic/ExpBump.h> #include <TNL/Functions/Analytic/SinBumps.h> @@ -137,7 +139,7 @@ setupFunction( const Config::ParameterContainer& parameters, } if( 
std::is_same< Device, Devices::Cuda >::value ) { - this->function = Devices::Cuda::passToDevice( *auxFunction ); + this->function = Cuda::passToDevice( *auxFunction ); delete auxFunction; TNL_CHECK_CUDA_DEVICE; } @@ -166,7 +168,7 @@ setupOperator( const Config::ParameterContainer& parameters, } if( std::is_same< Device, Devices::Cuda >::value ) { - this->operator_ = Devices::Cuda::passToDevice( *auxOperator ); + this->operator_ = Cuda::passToDevice( *auxOperator ); delete auxOperator; TNL_CHECK_CUDA_DEVICE; } @@ -736,7 +738,7 @@ deleteFunction() if( std::is_same< Device, Devices::Cuda >::value ) { if( function ) - Devices::Cuda::freeFromDevice( ( FunctionType * ) function ); + Cuda::freeFromDevice( ( FunctionType * ) function ); } } @@ -756,7 +758,7 @@ deleteOperator() if( std::is_same< Device, Devices::Cuda >::value ) { if( operator_ ) - Devices::Cuda::freeFromDevice( ( OperatorType * ) operator_ ); + Cuda::freeFromDevice( ( OperatorType * ) operator_ ); } } @@ -912,7 +914,7 @@ printFunction( std::ostream& str ) const } if( std::is_same< Device, Devices::Cuda >::value ) { - Devices::Cuda::print( f, str ); + Cuda::print( f, str ); return str; } } diff --git a/src/TNL/Logger_impl.h b/src/TNL/Logger_impl.h index 0e1dd8dc62434faf07b64a95f1f896ed9b8af940..6a3da0f9659e8688bca777d76d71b6947c62f99b 100644 --- a/src/TNL/Logger_impl.h +++ b/src/TNL/Logger_impl.h @@ -14,7 +14,7 @@ #include <iomanip> #include <TNL/Logger.h> -#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Cuda/DeviceInfo.h> #include <TNL/Devices/SystemInfo.h> namespace TNL { @@ -95,19 +95,19 @@ Logger::writeSystemInformation( const Config::ParameterContainer& parameters ) // for( int i = 0; i < devices; i++ ) // { // logger.writeParameter< int >( "Device no.", i, 1 ); - const int i = Devices::CudaDeviceInfo::getActiveDevice(); - writeParameter< String >( "Name", Devices::CudaDeviceInfo::getDeviceName( i ), 2 ); - const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( 
i ) ) + "." + - convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( i ) ); + const int i = Cuda::DeviceInfo::getActiveDevice(); + writeParameter< String >( "Name", Cuda::DeviceInfo::getDeviceName( i ), 2 ); + const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( i ) ) + "." + + convertToString( Cuda::DeviceInfo::getArchitectureMinor( i ) ); writeParameter< String >( "Architecture", deviceArch, 2 ); - writeParameter< int >( "CUDA cores", Devices::CudaDeviceInfo::getCudaCores( i ), 2 ); - const double clockRate = ( double ) Devices::CudaDeviceInfo::getClockRate( i ) / 1.0e3; + writeParameter< int >( "CUDA cores", Cuda::DeviceInfo::getCudaCores( i ), 2 ); + const double clockRate = ( double ) Cuda::DeviceInfo::getClockRate( i ) / 1.0e3; writeParameter< double >( "Clock rate (in MHz)", clockRate, 2 ); - const double globalMemory = ( double ) Devices::CudaDeviceInfo::getGlobalMemory( i ) / 1.0e9; + const double globalMemory = ( double ) Cuda::DeviceInfo::getGlobalMemory( i ) / 1.0e9; writeParameter< double >( "Global memory (in GB)", globalMemory, 2 ); - const double memoryClockRate = ( double ) Devices::CudaDeviceInfo::getMemoryClockRate( i ) / 1.0e3; + const double memoryClockRate = ( double ) Cuda::DeviceInfo::getMemoryClockRate( i ) / 1.0e3; writeParameter< double >( "Memory clock rate (in Mhz)", memoryClockRate, 2 ); - writeParameter< bool >( "ECC enabled", Devices::CudaDeviceInfo::getECCEnabled( i ), 2 ); + writeParameter< bool >( "ECC enabled", Cuda::DeviceInfo::getECCEnabled( i ), 2 ); // } } return true; diff --git a/src/TNL/Math.h b/src/TNL/Math.h index b7591bf65942b8070ec90f6d8a440cecab4807b6..321cc7ce39e0d0beb8c3c3c2a5ab3ac7bbfddbdd 100644 --- a/src/TNL/Math.h +++ b/src/TNL/Math.h @@ -15,7 +15,7 @@ #include <algorithm> #include <TNL/TypeTraits.h> -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> namespace TNL { diff --git a/src/TNL/Matrices/AdEllpack_impl.h b/src/TNL/Matrices/AdEllpack_impl.h 
index e754eca68afe082c0958c4ab3bdcbb7f546bcdde..a0f293b3df94afcfeda8124f8e1d8173cb4c7718 100644 --- a/src/TNL/Matrices/AdEllpack_impl.h +++ b/src/TNL/Matrices/AdEllpack_impl.h @@ -936,14 +936,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda2( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 256; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; @@ -984,14 +984,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda4( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 192; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; @@ -1043,14 +1043,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda8( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = 
( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; @@ -1101,14 +1101,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda16( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 128; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx ]; temp[ threadIdx.x ] = 0.0; @@ -1159,14 +1159,14 @@ void AdEllpack< Real, Device, Index >::spmvCuda32( const InVector& inVector, OutVector& outVector, const int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; IndexType warpIdx = globalIdx >> 5; IndexType inWarpIdx = globalIdx & ( this->warpSize - 1 ); if( globalIdx >= this->reduceMap.getSize() ) return; const int blockSize = 96; - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ IndexType reduceMap[ blockSize ]; reduceMap[ threadIdx.x ] = this->reduceMap[ globalIdx 
]; temp[ threadIdx.x ] = 0.0; @@ -1292,18 +1292,18 @@ public: { typedef AdEllpack< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); if( matrix.totalLoad < 2 ) { - dim3 blockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 blockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); - IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = blockSize.x * sizeof( Real ); AdEllpackVectorProductCuda2< Real, Index, InVector, OutVector > <<< cudaGridSize, blockSize, sharedMemory >>> @@ -1313,20 +1313,20 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } else if( matrix.totalLoad < 4 ) { - dim3 blockSize( 192 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 blockSize( 192 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); - IndexType 
cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = blockSize.x * sizeof( Real ); AdEllpackVectorProductCuda4< Real, Index, InVector, OutVector > <<< cudaGridSize, blockSize, sharedMemory >>> @@ -1336,20 +1336,20 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } else if( matrix.totalLoad < 8 ) { - dim3 blockSize( 128 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); - IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = blockSize.x * sizeof( Real ); AdEllpackVectorProductCuda8< Real, Index, InVector, OutVector > <<< cudaGridSize, blockSize, sharedMemory >>> @@ -1359,20 +1359,20 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( 
kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } else if( matrix.totalLoad < 16 ) { - dim3 blockSize( 128 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 blockSize( 128 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); - IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = blockSize.x * sizeof( Real ); AdEllpackVectorProductCuda16< Real, Index, InVector, OutVector > <<< cudaGridSize, blockSize, sharedMemory >>> @@ -1382,20 +1382,20 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } else { - dim3 blockSize( 96 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 blockSize( 96 ), cudaGridSize( Cuda::getMaxGridSize() ); IndexType cudaBlocks = roundUpDivision( matrix.reduceMap.getSize(), blockSize.x ); - IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = blockSize.x * sizeof( Real ); AdEllpackVectorProductCuda32< Real, Index, InVector, OutVector > <<< 
cudaGridSize, blockSize, sharedMemory >>> @@ -1405,9 +1405,9 @@ public: gridIdx ); } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; } } diff --git a/src/TNL/Matrices/BiEllpackSymmetric_impl.h b/src/TNL/Matrices/BiEllpackSymmetric_impl.h index 47b34282822c48d8e7fc384698c164b0036b9fc6..0af180c0e8c2c54d2c4fdb304fa3e2813d76786c 100644 --- a/src/TNL/Matrices/BiEllpackSymmetric_impl.h +++ b/src/TNL/Matrices/BiEllpackSymmetric_impl.h @@ -1053,7 +1053,7 @@ void BiEllpackSymmetric< Real, Device, Index, StripSize >::spmvCuda( const InVec IndexType bisection = this->warpSize; IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ Real results[ cudaBlockSize ]; results[ threadIdx.x ] = 0.0; IndexType elementPtr = ( this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx; @@ -1274,7 +1274,7 @@ void BiEllpackSymmetricVectorProductCuda( const BiEllpackSymmetric< Real, Device int gridIdx, const int warpSize ) { - Index globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; matrix->spmvCuda( *inVector, *outVector, globalIdx ); } #endif @@ -1394,7 +1394,7 @@ void performRowBubbleSortCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, S const typename BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize >::RowLengthsVector* rowLengths, int gridIdx ) { - const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * 
blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx ); } #endif @@ -1409,7 +1409,7 @@ void computeColumnSizesCuda( BiEllpackSymmetric< Real, Devices::Cuda, Index, Str const Index numberOfStrips, int gridIdx ) { - const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx ); } #endif @@ -1513,23 +1513,23 @@ public: Index numberOfStrips = matrix.virtualRows / StripSize; typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix; typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); performRowBubbleSortCuda< Real, Index, StripSize > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_rowLengths, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_rowLengths ); + Cuda::freeFromDevice( kernel_this ); 
+ Cuda::freeFromDevice( kernel_rowLengths ); TNL_CHECK_CUDA_DEVICE; #endif } @@ -1544,15 +1544,15 @@ public: const Index numberOfStrips = matrix.virtualRows / StripSize; typedef BiEllpackSymmetric< Real, Devices::Cuda, Index, StripSize > Matrix; typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); computeColumnSizesCuda< Real, Index, StripSize > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, @@ -1560,8 +1560,8 @@ public: numberOfStrips, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_rowLengths ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_rowLengths ); TNL_CHECK_CUDA_DEVICE; #endif } @@ -1579,16 +1579,16 @@ public: #ifdef HAVE_CUDA typedef BiEllpackSymmetric< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), 
cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); BiEllpackSymmetricVectorProductCuda< Real, Index, StripSize, InVector, OutVector > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> @@ -1598,9 +1598,9 @@ public: gridIdx, matrix.warpSize ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/BiEllpack_impl.h b/src/TNL/Matrices/BiEllpack_impl.h index ac02853616f17965458e9e14f8b225718d82354a..6af80899578528bd9d958c8515e3c228db4778d0 100644 --- a/src/TNL/Matrices/BiEllpack_impl.h +++ b/src/TNL/Matrices/BiEllpack_impl.h @@ -1057,7 +1057,7 @@ void BiEllpack< Real, Device, Index, StripSize >::spmvCuda( const InVector& inVe IndexType bisection = this->warpSize; IndexType groupBegin = strip * ( this->logWarpSize + 1 ); - Real* temp = Devices::Cuda::getSharedMemory< Real >(); + Real* temp = Cuda::getSharedMemory< Real >(); __shared__ Real results[ cudaBlockSize ]; results[ threadIdx.x ] = 0.0; IndexType elementPtr = ( 
this->groupPointers[ groupBegin ] << this->logWarpSize ) + inWarpIdx; @@ -1277,7 +1277,7 @@ void BiEllpackVectorProductCuda( const BiEllpack< Real, Devices::Cuda, Index, St int gridIdx, const int warpSize ) { - Index globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + Index globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; matrix->spmvCuda( *inVector, *outVector, globalIdx ); } #endif @@ -1397,7 +1397,7 @@ void performRowBubbleSortCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize const typename BiEllpack< Real, Devices::Cuda, Index, StripSize >::CompressedRowLengthsVector* rowLengths, int gridIdx ) { - const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->performRowBubbleSortCudaKernel( *rowLengths, stripIdx ); } #endif @@ -1412,7 +1412,7 @@ void computeColumnSizesCuda( BiEllpack< Real, Devices::Cuda, Index, StripSize >* const Index numberOfStrips, int gridIdx ) { - const Index stripIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + const Index stripIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->computeColumnSizesCudaKernel( *rowLengths, numberOfStrips, stripIdx ); } #endif @@ -1516,23 +1516,23 @@ public: Index numberOfStrips = matrix.virtualRows / StripSize; typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix; typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); 
+ CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); performRowBubbleSortCuda< Real, Index, StripSize > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_rowLengths, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_rowLengths ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_rowLengths ); TNL_CHECK_CUDA_DEVICE; #endif } @@ -1547,15 +1547,15 @@ public: const Index numberOfStrips = matrix.virtualRows / StripSize; typedef BiEllpack< Real, Devices::Cuda, Index, StripSize > Matrix; typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - CompressedRowLengthsVector* kernel_rowLengths = Devices::Cuda::passToDevice( rowLengths ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + CompressedRowLengthsVector* kernel_rowLengths = Cuda::passToDevice( rowLengths ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfStrips, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = 
cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); computeColumnSizesCuda< Real, Index, StripSize > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, @@ -1563,8 +1563,8 @@ public: numberOfStrips, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_rowLengths ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_rowLengths ); TNL_CHECK_CUDA_DEVICE; #endif } @@ -1582,16 +1582,16 @@ public: #ifdef HAVE_CUDA typedef BiEllpack< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); BiEllpackVectorProductCuda< Real, Index, StripSize, InVector, OutVector > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> @@ -1601,9 +1601,9 @@ public: gridIdx, matrix.warpSize ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - 
Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/CSR_impl.h b/src/TNL/Matrices/CSR_impl.h index 1e28157f90dc21b49532dc717a37534df82a2c9d..86b10119c249860cd8469f12ec89e12cd3d70aa9 100644 --- a/src/TNL/Matrices/CSR_impl.h +++ b/src/TNL/Matrices/CSR_impl.h @@ -33,7 +33,7 @@ template< typename Real, typename Index > CSR< Real, Device, Index >::CSR() : spmvCudaKernel( hybrid ), - cudaWarpSize( 32 ), //Devices::Cuda::getWarpSize() ) + cudaWarpSize( 32 ), //Cuda::getWarpSize() ) hybridModeSplit( 4 ) { }; @@ -145,16 +145,16 @@ Index CSR< Real, Device, Index >::getNonZeroRowLength( const IndexType row ) con // // (gdb) p rowPointers.getElement(0) // // Attempt to take address of value not located in memory. // IndexType resultHost ( 0 ); -// IndexType* resultCuda = Devices::Cuda::passToDevice( resultHost ); +// IndexType* resultCuda = Cuda::passToDevice( resultHost ); // // PROBLEM: If the second parameter of getNonZeroRowLengthCudaKernel is '&resultCuda', the following issue is thrown: // // 'error: no instance of function template "TNL::Matrices::getNonZeroRowLengthCudaKernel" matches the argument list' // TNL::Matrices::getNonZeroRowLengthCudaKernel< ConstMatrixRow, IndexType ><<< 1, 1 >>>( matrixRow, resultCuda ); // matrixRow works fine, tested them both separately // delete []cols; // delete []vals; // std::cout << "Checkpoint BEFORE passFromDevice" << std::endl; -// resultHost = Devices::Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE +// resultHost = Cuda::passFromDevice( resultCuda ); // This causes a crash: Illegal memory address in Cuda_impl.h at TNL_CHECK_CUDA_DEVICE // std::cout << "Checkpoint AFTER passFromDevice" << std::endl; -// Devices::Cuda::freeFromDevice( resultCuda ); +// 
Cuda::freeFromDevice( resultCuda ); // return resultHost; // } } @@ -713,7 +713,7 @@ void CSR< Real, Device, Index >::spmvCudaVectorized( const InVector& inVector, const IndexType warpEnd, const IndexType inWarpIdx ) const { - volatile Real* aux = Devices::Cuda::getSharedMemory< Real >(); + volatile Real* aux = Cuda::getSharedMemory< Real >(); for( IndexType row = warpStart; row < warpEnd; row++ ) { aux[ threadIdx.x ] = 0.0; @@ -753,7 +753,7 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, OutVector& outVector, int gridIdx ) const { - IndexType globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + IndexType globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; const IndexType warpStart = warpSize * ( globalIdx / warpSize ); const IndexType warpEnd = min( warpStart + warpSize, this->getRows() ); const IndexType inWarpIdx = globalIdx % warpSize; @@ -764,7 +764,7 @@ void CSR< Real, Device, Index >::vectorProductCuda( const InVector& inVector, /**** * Hybrid mode */ - const Index firstRow = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; + const Index firstRow = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x; const IndexType lastRow = min( this->getRows(), firstRow + blockDim. 
x ); const IndexType nonzerosPerRow = ( this->rowPointers[ lastRow ] - this->rowPointers[ firstRow ] ) / ( lastRow - firstRow ); @@ -828,7 +828,7 @@ __global__ void CSRVectorProductCudaKernel( const CSR< Real, Devices::Cuda, Inde { typedef CSR< Real, Devices::Cuda, Index > Matrix; static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); - const typename Matrix::IndexType rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const typename Matrix::IndexType rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( matrix->getCudaKernelType() == Matrix::scalar ) { if( rowIdx < matrix->getRows() ) @@ -854,17 +854,17 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, #ifdef HAVE_CUDA typedef CSR< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); TNL_CHECK_CUDA_DEVICE; - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * 
sizeof( Real ); if( matrix.getCudaWarpSize() == 32 ) CSRVectorProductCudaKernel< Real, Index, InVector, OutVector, 32 > @@ -911,9 +911,9 @@ void CSRVectorProductCuda( const CSR< Real, Devices::Cuda, Index >& matrix, } TNL_CHECK_CUDA_DEVICE; - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/ChunkedEllpack_impl.h b/src/TNL/Matrices/ChunkedEllpack_impl.h index 5bc0cfd27a2aa9e9519f57cc732cfe63605f6478..89e525e87bde4a752d2bd7cb4e17a4f27a79a79d 100644 --- a/src/TNL/Matrices/ChunkedEllpack_impl.h +++ b/src/TNL/Matrices/ChunkedEllpack_impl.h @@ -1123,7 +1123,7 @@ __device__ void ChunkedEllpack< Real, Device, Index >::computeSliceVectorProduct { static_assert( std::is_same < DeviceType, Devices::Cuda >::value, "" ); - RealType* chunkProducts = Devices::Cuda::getSharedMemory< RealType >(); + RealType* chunkProducts = Cuda::getSharedMemory< RealType >(); ChunkedEllpackSliceInfo* sliceInfo = ( ChunkedEllpackSliceInfo* ) & chunkProducts[ blockDim.x ]; if( threadIdx.x == 0 ) @@ -1403,7 +1403,7 @@ __global__ void ChunkedEllpackVectorProductCudaKernel( const ChunkedEllpack< Rea OutVector* outVector, int gridIdx ) { - const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x; + const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() + blockIdx.x; if( sliceIdx < matrix->getNumberOfSlices() ) matrix->computeSliceVectorProduct( inVector, outVector, sliceIdx ); @@ -1456,19 +1456,19 @@ class ChunkedEllpackDeviceDependentCode< Devices::Cuda > typedef ChunkedEllpack< Real, Devices::Cuda, Index > Matrix; typedef Index IndexType; typedef Real RealType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - 
OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); dim3 cudaBlockSize( matrix.getNumberOfChunksInSlice() ), - cudaGridSize( Devices::Cuda::getMaxGridSize() ); + cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = matrix.getNumberOfSlices(); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); const IndexType sharedMemory = cudaBlockSize.x * sizeof( RealType ) + sizeof( tnlChunkedEllpackSliceInfo< IndexType > ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); ChunkedEllpackVectorProductCudaKernel< Real, Index, InVector, OutVector > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> ( kernel_this, @@ -1476,9 +1476,9 @@ class ChunkedEllpackDeviceDependentCode< Devices::Cuda > kernel_outVector, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/Dense_impl.h b/src/TNL/Matrices/Dense_impl.h index f5767119744ca2e24568223d5a679a7d2e340cdf..246bd09edb459e6df9749af9d1589f508c2c5806 100644 --- a/src/TNL/Matrices/Dense_impl.h +++ b/src/TNL/Matrices/Dense_impl.h @@ -586,20 +586,20 @@ void Dense< Real, Device, Index >::getMatrixProduct( const Matrix1& matrix1, const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); cudaBlockSize.x = cudaBlockColumns; cudaBlockSize.y 
= cudaBlockRows; - const IndexType rowGrids = roundUpDivision( rowTiles, Devices::Cuda::getMaxGridSize() ); - const IndexType columnGrids = roundUpDivision( columnTiles, Devices::Cuda::getMaxGridSize() ); + const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() ); + const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) { - cudaGridSize.x = cudaGridSize.y = Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize(); if( gridIdx_x == columnGrids - 1 ) - cudaGridSize.x = columnTiles % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) - cudaGridSize.y = rowTiles % Devices::Cuda::getMaxGridSize(); - Dense* this_kernel = Devices::Cuda::passToDevice( *this ); - Matrix1* matrix1_kernel = Devices::Cuda::passToDevice( matrix1 ); - Matrix2* matrix2_kernel = Devices::Cuda::passToDevice( matrix2 ); + cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); + Dense* this_kernel = Cuda::passToDevice( *this ); + Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 ); + Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 ); DenseMatrixProductKernel< Real, Index, Matrix1, @@ -616,9 +616,9 @@ void Dense< Real, Device, Index >::getMatrixProduct( const Matrix1& matrix1, matrix2Multiplicator, gridIdx_x, gridIdx_y ); - Devices::Cuda::freeFromDevice( this_kernel ); - Devices::Cuda::freeFromDevice( matrix1_kernel ); - Devices::Cuda::freeFromDevice( matrix2_kernel ); + Cuda::freeFromDevice( this_kernel ); + Cuda::freeFromDevice( matrix1_kernel ); + Cuda::freeFromDevice( matrix2_kernel ); } #endif } @@ -669,7 +669,7 @@ __global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Ind rowBlock < tileDim; rowBlock += tileRowBlockSize ) { - tile[ Devices::Cuda::getInterleaving( 
threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = + tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElementFast( readColumnPosition, readRowPosition + rowBlock ); } @@ -688,7 +688,7 @@ __global__ void DenseTranspositionAlignedKernel( Dense< Real, Devices::Cuda, Ind { resultMatrix->setElementFast( writeColumnPosition, writeRowPosition + rowBlock, - matrixMultiplicator * tile[ Devices::Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); + matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } @@ -741,7 +741,7 @@ __global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda, rowBlock += tileRowBlockSize ) { if( readRowPosition + rowBlock < rows ) - tile[ Devices::Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = + tile[ Cuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] = inputMatrix->getElementFast( readColumnPosition, readRowPosition + rowBlock ); } @@ -765,7 +765,7 @@ __global__ void DenseTranspositionNonAlignedKernel( Dense< Real, Devices::Cuda, if( writeRowPosition + rowBlock < columns ) resultMatrix->setElementFast( writeColumnPosition, writeRowPosition + rowBlock, - matrixMultiplicator * tile[ Devices::Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); + matrixMultiplicator * tile[ Cuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] ); } } @@ -809,21 +809,21 @@ void Dense< Real, Device, Index >::getTransposition( const Matrix& matrix, const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim ); cudaBlockSize.x = cudaBlockColumns; cudaBlockSize.y = cudaBlockRows; - const IndexType rowGrids = roundUpDivision( rowTiles, Devices::Cuda::getMaxGridSize() ); - const IndexType columnGrids = roundUpDivision( columnTiles, Devices::Cuda::getMaxGridSize() ); - const IndexType sharedMemorySize = tileDim*tileDim + 
tileDim*tileDim/Devices::Cuda::getNumberOfSharedMemoryBanks(); + const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() ); + const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() ); + const IndexType sharedMemorySize = tileDim*tileDim + tileDim*tileDim/Cuda::getNumberOfSharedMemoryBanks(); - Dense* this_device = Devices::Cuda::passToDevice( *this ); - Matrix* matrix_device = Devices::Cuda::passToDevice( matrix ); + Dense* this_device = Cuda::passToDevice( *this ); + Matrix* matrix_device = Cuda::passToDevice( matrix ); for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ ) for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ ) { - cudaGridSize.x = cudaGridSize.y = Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize(); if( gridIdx_x == columnGrids - 1) - cudaGridSize.x = columnTiles % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = columnTiles % Cuda::getMaxGridSize(); if( gridIdx_y == rowGrids - 1 ) - cudaGridSize.y = rowTiles % Devices::Cuda::getMaxGridSize(); + cudaGridSize.y = rowTiles % Cuda::getMaxGridSize(); if( ( gridIdx_x < columnGrids - 1 || matrix.getColumns() % tileDim == 0 ) && ( gridIdx_y < rowGrids - 1 || matrix.getRows() % tileDim == 0 ) ) { @@ -859,8 +859,8 @@ void Dense< Real, Device, Index >::getTransposition( const Matrix& matrix, } TNL_CHECK_CUDA_DEVICE; } - Devices::Cuda::freeFromDevice( this_device ); - Devices::Cuda::freeFromDevice( matrix_device ); + Cuda::freeFromDevice( this_device ); + Cuda::freeFromDevice( matrix_device ); #endif } } diff --git a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h index b817372dc5620971df6f64251fbd2f716c5416e7..b949292c5f1664562525a4ead8ca17b2ad9f343b 100644 --- a/src/TNL/Matrices/EllpackSymmetricGraph_impl.h +++ b/src/TNL/Matrices/EllpackSymmetricGraph_impl.h @@ -54,7 +54,7 @@ void EllpackSymmetricGraph< Real, Device, Index >::setDimensions( 
const IndexTyp this->rows = rows; this->columns = columns; if( std::is_same< DeviceType, Devices::Cuda >::value ) - this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); + this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() ); else this->alignedRows = rows; if( this->rowLengths != 0 ) allocateElements(); @@ -917,7 +917,7 @@ void EllpackSymmetricGraphVectorProductCuda( const EllpackSymmetricGraph< Real, const int gridIdx, const int color ) { - int globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; matrix->spmvCuda( *inVector, *outVector, globalIdx, color ); } #endif @@ -966,19 +966,19 @@ class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef EllpackSymmetricGraph< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ ) { IndexType rows = matrix.getRowsOfColor( color ); const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = 
cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); EllpackSymmetricGraphVectorProductCuda< Real, Index, InVector, OutVector > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, @@ -989,9 +989,9 @@ class EllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > } } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/EllpackSymmetric_impl.h b/src/TNL/Matrices/EllpackSymmetric_impl.h index 5890212a4b5c207087f35bdacbcf3d097bb75a67..90369f77af0f0085b140934c27fe3fe5a2d8f015 100644 --- a/src/TNL/Matrices/EllpackSymmetric_impl.h +++ b/src/TNL/Matrices/EllpackSymmetric_impl.h @@ -38,7 +38,7 @@ void EllpackSymmetric< Real, Device, Index >::setDimensions( const IndexType row this->rows = rows; this->columns = columns; if( std::is_same< DeviceType, Devices::Cuda >::value ) - this->alignedRows = roundToMultiple( columns, Devices::Cuda::getWarpSize() ); + this->alignedRows = roundToMultiple( columns, Cuda::getWarpSize() ); else this->alignedRows = rows; if( this->rowLengths != 0 ) allocateElements(); @@ -708,7 +708,7 @@ void EllpackSymmetricVectorProductCuda( const EllpackSymmetric< Real, Devices::C OutVector* outVector, const int gridIdx ) { - int globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( globalIdx >= matrix->getRows() ) return; matrix->spmvCuda( *inVector, *outVector, globalIdx ); @@ -760,16 +760,16 @@ class EllpackSymmetricDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef EllpackSymmetric< Real, Devices::Cuda, Index > Matrix; typedef typename Matrix::IndexType IndexType; - 
Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); const int sharedMemory = cudaBlockSize.x * sizeof( Real ); EllpackSymmetricVectorProductCuda< Real, Index, InVector, OutVector > <<< cudaGridSize, cudaBlockSize, sharedMemory >>> @@ -778,9 +778,9 @@ class EllpackSymmetricDeviceDependentCode< Devices::Cuda > kernel_outVector, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/Ellpack_impl.h b/src/TNL/Matrices/Ellpack_impl.h index 2b8675c0469d93bd33a54901e1324256c43b0e31..7651ea0d7af455351a731b29b768906114227945 100644 --- a/src/TNL/Matrices/Ellpack_impl.h +++ b/src/TNL/Matrices/Ellpack_impl.h @@ -60,7 +60,7 @@ void Ellpack< Real, Device, Index >::setDimensions( const IndexType rows, this->rows = rows; this->columns = columns; 
if( std::is_same< Device, Devices::Cuda >::value ) - this->alignedRows = roundToMultiple( rows, Devices::Cuda::getWarpSize() ); + this->alignedRows = roundToMultiple( rows, Cuda::getWarpSize() ); else this->alignedRows = rows; if( this->rowLengths != 0 ) allocateElements(); @@ -128,7 +128,7 @@ void Ellpack< Real, Device, Index >::setLike( const Ellpack< Real2, Device2, Ind Sparse< Real, Device, Index >::setLike( matrix ); this->rowLengths = matrix.rowLengths; if( std::is_same< Device, Devices::Cuda >::value ) - this->alignedRows = roundToMultiple( this->getRows(), Devices::Cuda::getWarpSize() ); + this->alignedRows = roundToMultiple( this->getRows(), Cuda::getWarpSize() ); else this->alignedRows = this->getRows(); } @@ -836,7 +836,7 @@ __global__ void EllpackVectorProductCudaKernel( Real multiplicator, const Index gridIdx ) { - const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= rows ) return; Index i = rowIdx; @@ -902,16 +902,16 @@ class EllpackDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef Ellpack< Real, Device, Index > Matrix; typedef typename Matrix::IndexType IndexType; - //Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - //InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - //OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + //Matrix* kernel_this = Cuda::passToDevice( matrix ); + //InVector* kernel_inVector = Cuda::passToDevice( inVector ); + //OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() 
); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); EllpackVectorProductCudaKernel < Real, Index > <<< cudaGridSize, cudaBlockSize >>> @@ -928,9 +928,9 @@ class EllpackDeviceDependentCode< Devices::Cuda > gridIdx ); TNL_CHECK_CUDA_DEVICE; } - //Devices::Cuda::freeFromDevice( kernel_this ); - //Devices::Cuda::freeFromDevice( kernel_inVector ); - //Devices::Cuda::freeFromDevice( kernel_outVector ); + //Cuda::freeFromDevice( kernel_this ); + //Cuda::freeFromDevice( kernel_inVector ); + //Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; cudaDeviceSynchronize(); #endif diff --git a/src/TNL/Matrices/MatrixOperations.h b/src/TNL/Matrices/MatrixOperations.h index 07991a573662c7380d6fd2814b0a20db30e7dca8..a6ede3f7bd1dd593f22d055cf4ac2426cbc58d35 100644 --- a/src/TNL/Matrices/MatrixOperations.h +++ b/src/TNL/Matrices/MatrixOperations.h @@ -21,6 +21,8 @@ #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Math.h> +#include <TNL/Cuda/DeviceInfo.h> +#include <TNL/Cuda/SharedMemory.h> namespace TNL { namespace Matrices { @@ -248,7 +250,7 @@ GemvCudaKernel( const IndexType m, IndexType elementIdx = blockIdx.x * blockDim.x + threadIdx.x; const IndexType gridSize = blockDim.x * gridDim.x; - RealType* shx = Devices::Cuda::getSharedMemory< RealType >(); + RealType* shx = Cuda::getSharedMemory< RealType >(); if( threadIdx.x < n ) shx[ threadIdx.x ] = alpha * x[ threadIdx.x ]; @@ -344,10 +346,10 @@ public: Containers::Algorithms::ArrayOperations< Devices::Cuda, Devices::Host >::copy< RealType, RealType, IndexType >( xDevice.getData(), x, n ); // desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors - const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( 
Devices::CudaDeviceInfo::getActiveDevice() ); + const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() ); dim3 blockSize, gridSize; blockSize.x = 256; - gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( m, blockSize.x ) ); + gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( m, blockSize.x ) ); GemvCudaKernel<<< gridSize, blockSize, n * sizeof( RealType ) >>>( m, n, @@ -401,9 +403,9 @@ public: blockSize.x /= 2; // desGridSize = blocksPerMultiprocessor * numberOfMultiprocessors - const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); - gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( m, blockSize.x ) ); - gridSize.y = Devices::Cuda::getNumberOfBlocks( n, blockSize.y ); + const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() ); + gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( m, blockSize.x ) ); + gridSize.y = Cuda::getNumberOfBlocks( n, blockSize.y ); GeamCudaKernel<<< gridSize, blockSize >>>( m, n, diff --git a/src/TNL/Matrices/Matrix_impl.h b/src/TNL/Matrices/Matrix_impl.h index 7472760c23ec6d8df8a4295457cd6773e4ac80df..33c4d2e654cb32f9ba56516a1678b73d17ee3b96 100644 --- a/src/TNL/Matrices/Matrix_impl.h +++ b/src/TNL/Matrices/Matrix_impl.h @@ -12,6 +12,9 @@ #include <TNL/Matrices/Matrix.h> #include <TNL/Assert.h> +#include <TNL/Cuda/LaunchHelpers.h> +#include <TNL/Cuda/MemoryHelpers.h> +#include <TNL/Cuda/SharedMemory.h> namespace TNL { namespace Matrices { @@ -240,7 +243,7 @@ __global__ void MatrixVectorProductCudaKernel( const Matrix* matrix, int gridIdx ) { static_assert( std::is_same< typename Matrix::DeviceType, Devices::Cuda >::value, "" ); - const typename Matrix::IndexType rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const typename Matrix::IndexType rowIdx = ( gridIdx * 
Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < matrix->getRows() ) ( *outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, *inVector ); } @@ -255,16 +258,16 @@ void MatrixVectorProductCuda( const Matrix& matrix, { #ifdef HAVE_CUDA typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); MatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_this, kernel_inVector, @@ -272,9 +275,9 @@ void MatrixVectorProductCuda( const Matrix& matrix, gridIdx ); TNL_CHECK_CUDA_DEVICE; } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h index 
f9ef284dac30cb59a973314719bfe3481cdc80e3..bfe73f231092a0e4ea90c3011b823c6ab8c17d95 100644 --- a/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h +++ b/src/TNL/Matrices/SlicedEllpackSymmetricGraph_impl.h @@ -1095,7 +1095,7 @@ __global__ void SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_Cuda typename SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVector rowLengths, int gridIdx ) { - const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx ); } #endif @@ -1152,7 +1152,7 @@ void SlicedEllpackSymmetricGraphVectorProductCuda( const SlicedEllpackSymmetricG const int color, const int sliceOffset ) { - int globalIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset; + int globalIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x + sliceOffset; matrix->smvCuda( *inVector, *outVector, globalIdx, color ); } #endif @@ -1213,21 +1213,21 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef SlicedEllpackSymmetricGraph< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix ); + Matrix* kernel_matrix = Cuda::passToDevice( matrix ); const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const Index cudaGrids = 
roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); SlicedEllpackSymmetricGraph_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, rowLengths, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_matrix ); + Cuda::freeFromDevice( kernel_matrix ); TNL_CHECK_CUDA_DEVICE; #endif } @@ -1245,10 +1245,10 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef SlicedEllpackSymmetricGraph< Real, Devices::Cuda, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); + OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); for( IndexType color = 0; color < matrix.getNumberOfColors(); color++ ) { IndexType offset = matrix.colorPointers.getElement( color ); //can be computed in kernel @@ -1258,11 +1258,11 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > //IndexType rows = matrix.colorPointers.getElement( color + 1 ) - matrix.colorPointers.getElement( color ) + inSliceIdx; // TODO: rows id undefined /*const IndexType cudaBlocks = roundUpDivision( rows, cudaBlockSize.x ); - const IndexType cudaGrids = rondUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize ); + const IndexType cudaGrids = rondUpDivision( cudaBlocks, Cuda::getMaxGridSize ); for( 
IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); // TODO: this cannot be used here and i is undefined //IndexType offset = this->colorPointers[ i ]; IndexType inSliceIdx = offset % SliceSize; @@ -1277,9 +1277,9 @@ class SlicedEllpackSymmetricGraphDeviceDependentCode< Devices::Cuda > sliceOffset ); }*/ } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h index edc645688dc8ad5fedbeb74b1c6f55dd32fb1af8..f53efdc0c0db21c9d0ab12de2776a53e9e2f803c 100644 --- a/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h +++ b/src/TNL/Matrices/SlicedEllpackSymmetric_impl.h @@ -512,7 +512,7 @@ const SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >* matrix, OutVector* outVector, int gridIdx ) { - int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; matrix->spmvCuda( *inVector, *outVector, rowIdx ); } #endif @@ -784,7 +784,7 @@ __global__ void SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKerne typename SlicedEllpackSymmetric< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, int gridIdx ) { - const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->computeMaximalRowLengthInSlicesCuda( 
rowLengths, sliceIdx ); } #endif @@ -843,21 +843,21 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::RowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix ); + Matrix* kernel_matrix = Cuda::passToDevice( matrix ); const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); SlicedEllpackSymmetric_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, rowLengths, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_matrix ); + Cuda::freeFromDevice( kernel_matrix ); TNL_CHECK_CUDA_DEVICE; #endif } @@ -874,16 +874,16 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef SlicedEllpackSymmetric< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; - Matrix* kernel_this = Devices::Cuda::passToDevice( matrix ); - InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + Matrix* kernel_this = Cuda::passToDevice( matrix ); + InVector* kernel_inVector = Cuda::passToDevice( inVector ); 
+ OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); SlicedEllpackSymmetricVectorProductCudaKernel< Real, Index, SliceSize, InVector, OutVector > <<< cudaGridSize, cudaBlockSize >>> ( kernel_this, @@ -891,9 +891,9 @@ class SlicedEllpackSymmetricDeviceDependentCode< Devices::Cuda > kernel_outVector, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inVector ); - Devices::Cuda::freeFromDevice( kernel_outVector ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inVector ); + Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Matrices/SlicedEllpack_impl.h b/src/TNL/Matrices/SlicedEllpack_impl.h index c6caa56393d265931c6a8e12f4df3400b8e560b2..9f5875f1717e80e971ee0c735a9713dea5eb3973 100644 --- a/src/TNL/Matrices/SlicedEllpack_impl.h +++ b/src/TNL/Matrices/SlicedEllpack_impl.h @@ -876,7 +876,7 @@ __global__ void SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel( Sliced typename SlicedEllpack< Real, Devices::Cuda, Index, SliceSize >::ConstCompressedRowLengthsVectorView rowLengths, int gridIdx ) { - const Index sliceIdx = gridIdx * Devices::Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; + const Index sliceIdx = gridIdx * Cuda::getMaxGridSize() * blockDim.x + blockIdx.x * blockDim.x + threadIdx.x; matrix->computeMaximalRowLengthInSlicesCuda( rowLengths, sliceIdx ); } #endif 
@@ -899,7 +899,7 @@ __global__ void SlicedEllpackVectorProductCudaKernel( Real multiplicator, const Index gridIdx ) { - const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx >= rows ) return; const Index sliceIdx = rowIdx / SliceSize; @@ -975,21 +975,21 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef SlicedEllpack< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::CompressedRowLengthsVector CompressedRowLengthsVector; - Matrix* kernel_matrix = Devices::Cuda::passToDevice( matrix ); + Matrix* kernel_matrix = Cuda::passToDevice( matrix ); const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const Index cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x ); - const Index cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const Index cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( int gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); SlicedEllpack_computeMaximalRowLengthInSlices_CudaKernel< Real, Index, SliceSize ><<< cudaGridSize, cudaBlockSize >>> ( kernel_matrix, rowLengths, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_matrix ); + Cuda::freeFromDevice( kernel_matrix ); TNL_CHECK_CUDA_DEVICE; #endif return true; @@ -1009,16 +1009,16 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda > #ifdef HAVE_CUDA typedef SlicedEllpack< Real, Device, Index, SliceSize > Matrix; typedef typename Matrix::IndexType IndexType; - //Matrix* kernel_this = Devices::Cuda::passToDevice( 
matrix ); - //InVector* kernel_inVector = Devices::Cuda::passToDevice( inVector ); - //OutVector* kernel_outVector = Devices::Cuda::passToDevice( outVector ); - dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + //Matrix* kernel_this = Cuda::passToDevice( matrix ); + //InVector* kernel_inVector = Cuda::passToDevice( inVector ); + //OutVector* kernel_outVector = Cuda::passToDevice( outVector ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); SlicedEllpackVectorProductCudaKernel < Real, Index, SliceSize > <<< cudaGridSize, cudaBlockSize >>> @@ -1035,9 +1035,9 @@ class SlicedEllpackDeviceDependentCode< Devices::Cuda > gridIdx ); TNL_CHECK_CUDA_DEVICE; } - //Devices::Cuda::freeFromDevice( kernel_this ); - //Devices::Cuda::freeFromDevice( kernel_inVector ); - //Devices::Cuda::freeFromDevice( kernel_outVector ); + //Cuda::freeFromDevice( kernel_this ); + //Cuda::freeFromDevice( kernel_inVector ); + //Cuda::freeFromDevice( kernel_outVector ); TNL_CHECK_CUDA_DEVICE; cudaDeviceSynchronize(); #endif diff --git a/src/TNL/Matrices/SparseOperations_impl.h b/src/TNL/Matrices/SparseOperations_impl.h index ccc8930f9bb99825d89de1f7df6de5ce31fa427d..b6d118bf22e39c9ee345269c2a2cc28760b3a198 100644 --- a/src/TNL/Matrices/SparseOperations_impl.h +++ b/src/TNL/Matrices/SparseOperations_impl.h @@ -130,8 +130,8 @@ copySparseMatrix_impl( Matrix1& A, const Matrix2& B ) #ifdef HAVE_CUDA dim3 blockSize( 256 ); dim3 gridSize; - const IndexType desGridSize = 32 * 
Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); - gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( rows, blockSize.x ) ); + const IndexType desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() ); + gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( rows, blockSize.x ) ); typename Matrix1::CompressedRowLengthsVector rowLengths; rowLengths.setSize( rows ); diff --git a/src/TNL/Matrices/SparseRow.h b/src/TNL/Matrices/SparseRow.h index c7ebd07039061fdb775a583ac065e0a71c5f4869..f66cd2ceaf1c6f0cd882bb962a78c6649816aa75 100644 --- a/src/TNL/Matrices/SparseRow.h +++ b/src/TNL/Matrices/SparseRow.h @@ -14,7 +14,7 @@ #include <type_traits> #include <ostream> -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> namespace TNL { namespace Matrices { diff --git a/src/TNL/Matrices/Tridiagonal_impl.h b/src/TNL/Matrices/Tridiagonal_impl.h index 78e798c2bf0a43213d14a89af6f6a922d3b23ca0..62575f1776144e374b560a65e213248a1177de80 100644 --- a/src/TNL/Matrices/Tridiagonal_impl.h +++ b/src/TNL/Matrices/Tridiagonal_impl.h @@ -452,7 +452,7 @@ __global__ void TridiagonalTranspositionCudaKernel( const Tridiagonal< Real2, De const Real matrixMultiplicator, const Index gridIdx ) { - const Index rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + const Index rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( rowIdx < inMatrix->getRows() ) { if( rowIdx > 0 ) @@ -494,24 +494,24 @@ void Tridiagonal< Real, Device, Index >::getTransposition( const Tridiagonal< Re if( std::is_same< Device, Devices::Cuda >::value ) { #ifdef HAVE_CUDA - Tridiagonal* kernel_this = Devices::Cuda::passToDevice( *this ); + Tridiagonal* kernel_this = Cuda::passToDevice( *this ); typedef Tridiagonal< Real2, Device, Index2 > InMatrixType; - InMatrixType* kernel_inMatrix = Devices::Cuda::passToDevice( matrix ); 
- dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() ); + InMatrixType* kernel_inMatrix = Cuda::passToDevice( matrix ); + dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() ); const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x ); - const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() ); + const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() ); for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) { if( gridIdx == cudaGrids - 1 ) - cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize(); + cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize(); TridiagonalTranspositionCudaKernel<<< cudaGridSize, cudaBlockSize >>> ( kernel_inMatrix, kernel_this, matrixMultiplicator, gridIdx ); } - Devices::Cuda::freeFromDevice( kernel_this ); - Devices::Cuda::freeFromDevice( kernel_inMatrix ); + Cuda::freeFromDevice( kernel_this ); + Cuda::freeFromDevice( kernel_inMatrix ); TNL_CHECK_CUDA_DEVICE; #endif } diff --git a/src/TNL/Meshes/Geometry/getEntityCenter.h b/src/TNL/Meshes/Geometry/getEntityCenter.h index 59cd950ca180cdbf8095382536544e9fe0ffde51..a37c27acf00523341382a7f44c5ef74adfd14d45 100644 --- a/src/TNL/Meshes/Geometry/getEntityCenter.h +++ b/src/TNL/Meshes/Geometry/getEntityCenter.h @@ -10,7 +10,7 @@ #pragma once -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Meshes/GridEntity.h> #include <TNL/Meshes/Mesh.h> #include <TNL/Meshes/MeshEntity.h> diff --git a/src/TNL/Meshes/Geometry/getEntityMeasure.h b/src/TNL/Meshes/Geometry/getEntityMeasure.h index 7402e4f6db3226b534caf6753958fc10e4efdb1a..a3381ed96b1e72ddc2f0ccf25bbbdccd7e71739a 100644 --- a/src/TNL/Meshes/Geometry/getEntityMeasure.h +++ b/src/TNL/Meshes/Geometry/getEntityMeasure.h @@ -10,7 +10,7 @@ #pragma once -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Meshes/GridEntity.h> #include <TNL/Meshes/Mesh.h> 
#include <TNL/Meshes/MeshEntity.h> diff --git a/src/TNL/Meshes/GridDetails/GridTraverser.h b/src/TNL/Meshes/GridDetails/GridTraverser.h index 7ce106f5d9c9e05160490d59434b7d170298a993..e8702153fb03343f85badc160676b1c788eaa948 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser.h +++ b/src/TNL/Meshes/GridDetails/GridTraverser.h @@ -12,7 +12,6 @@ #include <TNL/Meshes/Grid.h> #include <TNL/Pointers/SharedPointer.h> -#include <TNL/CudaStreamPool.h> namespace TNL { namespace Meshes { diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp index 53370853824a8fde7f47bf28c0a68de76f84a2b0..796ffe491d572c108f7b8072769e42e9d5c9571d 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_1D.hpp @@ -16,7 +16,7 @@ #include <TNL/Meshes/Grid.h> #include <TNL/Pointers/SharedPointer.h> -#include <TNL/CudaStreamPool.h> +#include <TNL/Cuda/StreamPool.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Meshes/GridDetails/GridTraverser.h> #include <TNL/Exceptions/NotImplementedError.h> @@ -120,7 +120,7 @@ GridTraverser1D( typedef Meshes::Grid< 1, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; - coordinates.x() = begin.x() + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; + coordinates.x() = begin.x() + ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x; if( coordinates <= end ) { GridEntity entity( *grid, coordinates ); @@ -182,7 +182,7 @@ processEntities( const int& stream ) { #ifdef HAVE_CUDA - auto& pool = CudaStreamPool::getInstance(); + auto& pool = Cuda::StreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); @@ -200,7 +200,7 @@ processEntities( else { dim3 blockSize( 256 ), blocksCount, gridsCount; - Devices::Cuda::setupThreads( + Cuda::setupThreads( blockSize, blocksCount, gridsCount, @@ -209,7 +209,7 
@@ processEntities( for( gridIdx.x = 0; gridIdx.x < gridsCount.x; gridIdx.x++ ) { dim3 gridSize; - Devices::Cuda::setupGrid( + Cuda::setupGrid( blocksCount, gridsCount, gridIdx, @@ -225,8 +225,8 @@ processEntities( /*dim3 cudaBlockSize( 256 ); dim3 cudaBlocks; - cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); - const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x ); + cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x ); + const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x ); for( IndexType gridXIdx = 0; gridXIdx < cudaXGrids; gridXIdx ++ ) GridTraverser1D< Real, Index, GridEntity, UserData, EntitiesProcessor > diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp index 3efdb478fd9d8874a730d48deff8849733f0b13e..15c5a0eda13a10bd512fe69930bed1e9c90f068d 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_2D.hpp @@ -12,7 +12,7 @@ #include <TNL/Meshes/Grid.h> #include <TNL/Pointers/SharedPointer.h> -#include <TNL/CudaStreamPool.h> +#include <TNL/Cuda/StreamPool.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Meshes/GridDetails/GridTraverser.h> @@ -148,8 +148,8 @@ GridTraverser2D( typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx ); if( coordinates <= end ) { @@ -186,7 +186,7 @@ GridTraverser2DBoundaryAlongX( typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; - coordinates.x() = beginX + 
Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.x() = beginX + Cuda::getGlobalThreadIdx_x( gridIdx ); coordinates.y() = fixedY; if( coordinates.x() <= endX ) @@ -222,7 +222,7 @@ GridTraverser2DBoundaryAlongY( typename GridType::CoordinatesType coordinates; coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = beginY + Cuda::getGlobalThreadIdx_x( gridIdx ); if( coordinates.y() <= endY ) { @@ -291,10 +291,10 @@ GridTraverser2DBoundary( /*const Index aux = max( entitiesAlongX, entitiesAlongY ); - const Index& warpSize = Devices::Cuda::getWarpSize(); + const Index& warpSize = Cuda::getWarpSize(); const Index threadsPerAxis = warpSize * ( aux / warpSize + ( aux % warpSize != 0 ) ); - Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx ); GridEntity entity( *grid, CoordinatesType( 0, 0 ), gridEntityParameters... ); @@ -414,10 +414,10 @@ processEntities( dim3 cudaBlockSize( 256 ); dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX, cudaBlocksCountAlongY, cudaGridsCountAlongY; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 ); - auto& pool = CudaStreamPool::getInstance(); + auto& pool = Cuda::StreamPool::getInstance(); Devices::Cuda::synchronizeDevice(); const cudaStream_t& s1 = pool.getStream( stream ); @@ -425,8 +425,8 @@ processEntities( dim3 gridIdx, cudaGridSize; for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ ) { - Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize 
); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize ); + //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaGridSize, cudaBlockSize, 0, s1 >>> ( &gridPointer.template getData< Devices::Cuda >(), @@ -450,7 +450,7 @@ processEntities( const cudaStream_t& s4 = pool.getStream( stream + 3 ); for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ ) { - Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); + Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize ); GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> <<< cudaGridSize, cudaBlockSize, 0, s3 >>> ( &gridPointer.template getData< Devices::Cuda >(), @@ -482,15 +482,15 @@ processEntities( const IndexType maxFaceSize = max( entitiesAlongX, entitiesAlongY ); const IndexType blocksPerFace = maxFaceSize / cudaBlockSize.x + ( maxFaceSize % cudaBlockSize.x != 0 ); IndexType cudaThreadsCount = 4 * cudaBlockSize.x * blocksPerFace; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount ); //std::cerr << "blocksPerFace = " << blocksPerFace << "Threads count = " << cudaThreadsCount // << "cudaBlockCount = " << cudaBlocksCount.x << std::endl; dim3 gridIdx, cudaGridSize; Devices::Cuda::synchronizeDevice(); for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ ) { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); + Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCountAlongX, cudaGridSize, cudaGridsCountAlongX ); GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> <<< cudaGridSize, cudaBlockSize >>> ( &gridPointer.template getData< Devices::Cuda >(), @@ -511,11 +511,11 @@ processEntities( { dim3 cudaBlockSize( 16, 16 ); dim3 cudaBlocksCount, cudaGridsCount; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1 ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1 ); - auto& pool = CudaStreamPool::getInstance(); + auto& pool = Cuda::StreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); @@ -523,8 +523,8 @@ processEntities( for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); - //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); + Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize ); + //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount ); GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> <<< cudaGridSize, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), diff --git a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp index 24200c15dd89661492142c5f74f3166feb5d7ed6..489e58d779b57e7b5afea4b2d99516fb23b04f31 100644 --- a/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp +++ b/src/TNL/Meshes/GridDetails/GridTraverser_3D.hpp @@ -12,7 +12,7 @@ #include <TNL/Meshes/Grid.h> #include <TNL/Pointers/SharedPointer.h> -#include <TNL/CudaStreamPool.h> +#include <TNL/Cuda/StreamPool.h> #include <TNL/Exceptions/CudaSupportMissing.h> #include <TNL/Meshes/GridDetails/GridTraverser.h> #include <TNL/Exceptions/NotImplementedError.h> @@ -177,9 +177,9 @@ GridTraverser3D( typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; - coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); - coordinates.z() = begin.z() + Devices::Cuda::getGlobalThreadIdx_z( gridIdx ); + coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = begin.z() + Cuda::getGlobalThreadIdx_z( gridIdx ); if( coordinates <= end ) { @@ -217,8 +217,8 @@ GridTraverser3DBoundaryAlongXY( typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; typename GridType::CoordinatesType coordinates; - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.x() = beginX + Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.y() = beginY + Cuda::getGlobalThreadIdx_y( gridIdx ); coordinates.z() = fixedZ; if( coordinates.x() <= endX && coordinates.y() <= endY ) @@ -254,9 +254,9 @@ GridTraverser3DBoundaryAlongXZ( typedef Meshes::Grid< 3, Real, Devices::Cuda, Index > GridType; 
typename GridType::CoordinatesType coordinates; - coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.x() = beginX + Cuda::getGlobalThreadIdx_x( gridIdx ); coordinates.y() = fixedY; - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.z() = beginZ + Cuda::getGlobalThreadIdx_y( gridIdx ); if( coordinates.x() <= endX && coordinates.z() <= endZ ) { @@ -292,8 +292,8 @@ GridTraverser3DBoundaryAlongYZ( typename GridType::CoordinatesType coordinates; coordinates.x() = fixedX; - coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx ); - coordinates.z() = beginZ + Devices::Cuda::getGlobalThreadIdx_y( gridIdx ); + coordinates.y() = beginY + Cuda::getGlobalThreadIdx_x( gridIdx ); + coordinates.z() = beginZ + Cuda::getGlobalThreadIdx_y( gridIdx ); if( coordinates.y() <= endY && coordinates.z() <= endZ ) { @@ -341,11 +341,11 @@ processEntities( dim3 cudaBlocksCountAlongXY, cudaBlocksCountAlongXZ, cudaBlocksCountAlongYZ, cudaGridsCountAlongXY, cudaGridsCountAlongXZ, cudaGridsCountAlongYZ; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXY, cudaGridsCountAlongXY, entitiesAlongX, entitiesAlongY ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, entitiesAlongX, entitiesAlongZ - 2 ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, entitiesAlongY - 2, entitiesAlongZ - 2 ); - auto& pool = CudaStreamPool::getInstance(); + auto& pool = Cuda::StreamPool::getInstance(); Devices::Cuda::synchronizeDevice(); const cudaStream_t& s1 = 
pool.getStream( stream ); @@ -359,7 +359,7 @@ processEntities( for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXY.y; gridIdx.y++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXY.x; gridIdx.x++ ) { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); + Cuda::setupGrid( cudaBlocksCountAlongXY, cudaGridsCountAlongXY, gridIdx, gridSize ); GridTraverser3DBoundaryAlongXY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksCountAlongXY, cudaBlockSize, 0 , s1 >>> ( &gridPointer.template getData< Devices::Cuda >(), @@ -386,7 +386,7 @@ processEntities( for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongXZ.y; gridIdx.y++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongXZ.x; gridIdx.x++ ) { - Devices::Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); + Cuda::setupGrid( cudaBlocksCountAlongXZ, cudaGridsCountAlongXZ, gridIdx, gridSize ); GridTraverser3DBoundaryAlongXZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... > <<< cudaBlocksCountAlongXZ, cudaBlockSize, 0, s3 >>> ( &gridPointer.template getData< Devices::Cuda >(), @@ -413,7 +413,7 @@ processEntities( for( gridIdx.y = 0; gridIdx.y < cudaGridsCountAlongYZ.y; gridIdx.y++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongYZ.x; gridIdx.x++ ) { - Devices::Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); + Cuda::setupGrid( cudaBlocksCountAlongYZ, cudaGridsCountAlongYZ, gridIdx, gridSize ); GridTraverser3DBoundaryAlongYZ< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> <<< cudaBlocksCountAlongYZ, cudaBlockSize, 0, s5 >>> ( &gridPointer.template getData< Devices::Cuda >(), @@ -450,12 +450,12 @@ processEntities( dim3 cudaBlockSize( 8, 8, 8 ); dim3 cudaBlocksCount, cudaGridsCount; - Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, - end.x() - begin.x() + 1, - end.y() - begin.y() + 1, - end.z() - begin.z() + 1 ); + Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, + end.x() - begin.x() + 1, + end.y() - begin.y() + 1, + end.z() - begin.z() + 1 ); - auto& pool = CudaStreamPool::getInstance(); + auto& pool = Cuda::StreamPool::getInstance(); const cudaStream_t& s = pool.getStream( stream ); Devices::Cuda::synchronizeDevice(); @@ -464,7 +464,7 @@ processEntities( for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ ) for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ ) { - Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); + Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, gridSize ); GridTraverser3D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... 
> <<< gridSize, cudaBlockSize, 0, s >>> ( &gridPointer.template getData< Devices::Cuda >(), diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h index dd9562add377b02c3ab9ca91fa4804762182b46c..29fe8ffd67d0e7e0550abd6540391d24d5a0205d 100644 --- a/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h +++ b/src/TNL/Meshes/GridDetails/NeighborGridEntitiesStorage.h @@ -10,7 +10,7 @@ #pragma once -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Meshes/DimensionTag.h> #include <TNL/Meshes/GridEntityConfig.h> #include <TNL/Meshes/GridDetails/NeighborGridEntityGetter.h> diff --git a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h index 84a9c56d9389f31c013206c10f63fceb81ec2e0c..f7a3cc180fa3da51b8830f8001e8d616e621f891 100644 --- a/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h +++ b/src/TNL/Meshes/GridDetails/NeighborGridEntityGetter.h @@ -11,7 +11,7 @@ #pragma once #include <TNL/Assert.h> -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Meshes/GridEntityConfig.h> namespace TNL { diff --git a/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h b/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h index 64485dc3c7ecb31ebdeb9891830244776550e315..110fa9eefc1ca498435ce2fd11d1d84df2ad4410 100644 --- a/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h +++ b/src/TNL/Meshes/MeshDetails/MeshEntityIndex.h @@ -17,7 +17,7 @@ #pragma once #include <TNL/Assert.h> -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CudaCallable.h> namespace TNL { namespace Meshes { diff --git a/src/TNL/Meshes/MeshDetails/Traverser_impl.h b/src/TNL/Meshes/MeshDetails/Traverser_impl.h index 5dedf58fd2ceea521e51ece53e42d0efd65caec1..27cbba714160986d156c4c5e828274df53fb1bf3 100644 --- a/src/TNL/Meshes/MeshDetails/Traverser_impl.h +++ b/src/TNL/Meshes/MeshDetails/Traverser_impl.h @@ -13,6 +13,8 @@ #include 
<TNL/Meshes/Traverser.h> #include <TNL/Exceptions/CudaSupportMissing.h> +#include <TNL/Cuda/DeviceInfo.h> +#include <TNL/Cuda/LaunchHelpers.h> namespace TNL { namespace Meshes { @@ -159,8 +161,8 @@ processBoundaryEntities( const MeshPointer& meshPointer, dim3 blockSize( 256 ); dim3 gridSize; - const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); - gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) ); + const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() ); + gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) ); Devices::Cuda::synchronizeDevice(); MeshTraverserBoundaryEntitiesKernel< EntitiesDimension, EntitiesProcessor > @@ -190,8 +192,8 @@ processInteriorEntities( const MeshPointer& meshPointer, dim3 blockSize( 256 ); dim3 gridSize; - const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); - gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) ); + const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() ); + gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) ); Devices::Cuda::synchronizeDevice(); MeshTraverserInteriorEntitiesKernel< EntitiesDimension, EntitiesProcessor > @@ -221,8 +223,8 @@ processAllEntities( const MeshPointer& meshPointer, dim3 blockSize( 256 ); dim3 gridSize; - const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); - gridSize.x = min( desGridSize, Devices::Cuda::getNumberOfBlocks( entitiesCount, blockSize.x ) ); + const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() ); + gridSize.x = min( desGridSize, Cuda::getNumberOfBlocks( 
entitiesCount, blockSize.x ) ); Devices::Cuda::synchronizeDevice(); MeshTraverserAllEntitiesKernel< EntitiesDimension, EntitiesProcessor > diff --git a/src/TNL/Object.h b/src/TNL/Object.h index ba41510953c976ab4730084802a014d80108d267..ff7432635a13379c83bf49af758f1da7dd8f6b28 100644 --- a/src/TNL/Object.h +++ b/src/TNL/Object.h @@ -12,7 +12,6 @@ #include <vector> -#include <TNL/Devices/CudaCallable.h> #include <TNL/String.h> #include <TNL/File.h> diff --git a/src/TNL/ParallelFor.h b/src/TNL/ParallelFor.h index 04af2740807b9139ca0b8452b9c1b7bc52f5a8c8..cc9ce7080679a2a6fbdddd067095eb110a3d1c2a 100644 --- a/src/TNL/ParallelFor.h +++ b/src/TNL/ParallelFor.h @@ -12,7 +12,9 @@ #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> -#include <TNL/Devices/CudaDeviceInfo.h> +#include <TNL/Cuda/CheckDevice.h> +#include <TNL/Cuda/DeviceInfo.h> +#include <TNL/Cuda/LaunchHelpers.h> #include <TNL/Math.h> /**** @@ -203,14 +205,14 @@ struct ParallelFor< Devices::Cuda, Mode > if( end > start ) { dim3 blockSize( 256 ); dim3 gridSize; - gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) ); + gridSize.x = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( end - start, blockSize.x ) ); if( (std::size_t) blockSize.x * gridSize.x >= (std::size_t) end - start ) ParallelForKernel< false ><<< gridSize, blockSize >>>( start, end, f, args... 
); else { // decrease the grid size and align to the number of multiprocessors - const int desGridSize = 32 * Devices::CudaDeviceInfo::getCudaMultiprocessors( Devices::CudaDeviceInfo::getActiveDevice() ); - gridSize.x = TNL::min( desGridSize, Devices::Cuda::getNumberOfBlocks( end - start, blockSize.x ) ); + const int desGridSize = 32 * Cuda::DeviceInfo::getCudaMultiprocessors( Cuda::DeviceInfo::getActiveDevice() ); + gridSize.x = TNL::min( desGridSize, Cuda::getNumberOfBlocks( end - start, blockSize.x ) ); ParallelForKernel< true ><<< gridSize, blockSize >>>( start, end, f, args... ); } @@ -253,8 +255,8 @@ struct ParallelFor2D< Devices::Cuda, Mode > blockSize.y = TNL::min( 8, sizeY ); } dim3 gridSize; - gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) ); - gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) ); + gridSize.x = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeX, blockSize.x ) ); + gridSize.y = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeY, blockSize.y ) ); dim3 gridCount; gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x ); @@ -337,9 +339,9 @@ struct ParallelFor3D< Devices::Cuda, Mode > blockSize.z = TNL::min( 4, sizeZ ); } dim3 gridSize; - gridSize.x = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeX, blockSize.x ) ); - gridSize.y = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeY, blockSize.y ) ); - gridSize.z = TNL::min( Devices::Cuda::getMaxGridSize(), Devices::Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) ); + gridSize.x = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeX, blockSize.x ) ); + gridSize.y = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeY, blockSize.y ) ); + gridSize.z = TNL::min( Cuda::getMaxGridSize(), Cuda::getNumberOfBlocks( sizeZ, blockSize.z ) ); dim3 gridCount; 
gridCount.x = roundUpDivision( sizeX, blockSize.x * gridSize.x ); diff --git a/src/TNL/Pointers/DevicePointer.h b/src/TNL/Pointers/DevicePointer.h index 136c809cda7b8cbd3ad87957eca4f37f5bc6ff88..d3bf9b07f9b2c59030433e1bca633a2601bb73d0 100644 --- a/src/TNL/Pointers/DevicePointer.h +++ b/src/TNL/Pointers/DevicePointer.h @@ -17,6 +17,7 @@ #include <TNL/Devices/Cuda.h> #include <TNL/Pointers/SmartPointer.h> #include <TNL/TypeInfo.h> +#include <TNL/Cuda/MemoryHelpers.h> #include <cstring> // std::memcpy, std::memcmp @@ -422,7 +423,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer this->pointer = &obj; this->pd = new PointerData(); // pass to device - this->cuda_pointer = Devices::Cuda::passToDevice( *this->pointer ); + this->cuda_pointer = Cuda::passToDevice( *this->pointer ); // set last-sync state this->set_last_sync_state(); Devices::Cuda::insertSmartPointer( this ); @@ -456,7 +457,7 @@ class DevicePointer< Object, Devices::Cuda > : public SmartPointer delete this->pd; this->pd = nullptr; if( this->cuda_pointer ) - Devices::Cuda::freeFromDevice( this->cuda_pointer ); + Cuda::freeFromDevice( this->cuda_pointer ); } } } diff --git a/src/TNL/Pointers/SharedPointerCuda.h b/src/TNL/Pointers/SharedPointerCuda.h index 9c883c23a7c5f1c73e6861beb96bfafd61bd7571..27a4aabef4ed7e1383ae5f141e1c7e56a6f5d350 100644 --- a/src/TNL/Pointers/SharedPointerCuda.h +++ b/src/TNL/Pointers/SharedPointerCuda.h @@ -16,6 +16,7 @@ #include <TNL/Devices/Cuda.h> #include <TNL/Pointers/SmartPointer.h> +#include <TNL/Cuda/MemoryHelpers.h> #include <cstring> // std::memcpy, std::memcmp #include <cstddef> // std::nullptr_t @@ -570,7 +571,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer { this->pd = new PointerData( args... 
); // pass to device - this->cuda_pointer = Devices::Cuda::passToDevice( this->pd->data ); + this->cuda_pointer = Cuda::passToDevice( this->pd->data ); // set last-sync state this->set_last_sync_state(); #ifdef TNL_DEBUG_SHARED_POINTERS @@ -608,7 +609,7 @@ class SharedPointer< Object, Devices::Cuda > : public SmartPointer delete this->pd; this->pd = nullptr; if( this->cuda_pointer ) - Devices::Cuda::freeFromDevice( this->cuda_pointer ); + Cuda::freeFromDevice( this->cuda_pointer ); #ifdef TNL_DEBUG_SHARED_POINTERS std::cerr << "...deleted data." << std::endl; #endif diff --git a/src/TNL/Pointers/SharedPointerHost.h b/src/TNL/Pointers/SharedPointerHost.h index 087cfd79e9fbe77ac7e4c9a5a2e3a8c2cce5fe5c..39a6d4da4a2b8ab8b964110173d1716fade1ac71 100644 --- a/src/TNL/Pointers/SharedPointerHost.h +++ b/src/TNL/Pointers/SharedPointerHost.h @@ -15,7 +15,7 @@ #include "SharedPointer.h" #include <TNL/Devices/Host.h> -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Pointers/SmartPointer.h> #include <cstddef> // std::nullptr_t diff --git a/src/TNL/Pointers/UniquePointer.h b/src/TNL/Pointers/UniquePointer.h index 53f2dac5b87a012354fe06675efafcecdab48dfc..9a7a2b677769fde29d49419141a76cd9cd6f4373 100644 --- a/src/TNL/Pointers/UniquePointer.h +++ b/src/TNL/Pointers/UniquePointer.h @@ -16,6 +16,7 @@ #include <TNL/Devices/Host.h> #include <TNL/Devices/Cuda.h> #include <TNL/Pointers/SmartPointer.h> +#include <TNL/Cuda/MemoryHelpers.h> #include <cstring> // std::memcpy, std::memcmp #include <cstddef> // std::nullptr_t @@ -272,7 +273,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer { this->pd = new PointerData( args... 
); // pass to device - this->cuda_pointer = Devices::Cuda::passToDevice( this->pd->data ); + this->cuda_pointer = Cuda::passToDevice( this->pd->data ); // set last-sync state this->set_last_sync_state(); Devices::Cuda::insertSmartPointer( this ); @@ -300,7 +301,7 @@ class UniquePointer< Object, Devices::Cuda > : public SmartPointer if( this->pd ) delete this->pd; if( this->cuda_pointer ) - Devices::Cuda::freeFromDevice( this->cuda_pointer ); + Cuda::freeFromDevice( this->cuda_pointer ); } PointerData* pd; diff --git a/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h b/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h index a28a64cf5567eaee40f5d4efca3fd24af3dd2819..d1b871c25c0a998b68a770d4c0629ad76d20dfb5 100644 --- a/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h +++ b/src/TNL/Solvers/PDE/BoundaryConditionsSetter.h @@ -11,7 +11,7 @@ #pragma once -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CudaCallable.h> #include <TNL/Functions/FunctionAdapter.h> #include <TNL/Pointers/SharedPointer.h> #include <TNL/Meshes/Traverser.h> diff --git a/src/TNL/StaticFor.h b/src/TNL/StaticFor.h index 990036dfc0090708851468e03e991a30a07cc835..c37763aaa9fb5ae740755b1b8a3750c71b92ae65 100644 --- a/src/TNL/StaticFor.h +++ b/src/TNL/StaticFor.h @@ -10,7 +10,7 @@ #pragma once -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CudaCallable.h> namespace TNL { diff --git a/src/TNL/TemplateStaticFor.h b/src/TNL/TemplateStaticFor.h index 88ad764fd9b78d0348469a115ee1cb83ecb7993b..efd9d1ad9c686a6ff9780656ce3cd0d9d85727e0 100644 --- a/src/TNL/TemplateStaticFor.h +++ b/src/TNL/TemplateStaticFor.h @@ -13,7 +13,7 @@ #include <utility> #include <type_traits> -#include <TNL/Devices/CudaCallable.h> +#include <TNL/Cuda/CudaCallable.h> namespace TNL { namespace detail { diff --git a/src/UnitTests/AssertCudaTest.cu b/src/UnitTests/AssertCudaTest.cu index 9d4865eb9c8ba3b7aaa6f8bf5506fabe17be4483..8f42da6772dc2b4b6dbe3f185ca96bc8efde9290 100644 --- a/src/UnitTests/AssertCudaTest.cu +++ 
b/src/UnitTests/AssertCudaTest.cu @@ -13,7 +13,7 @@ #endif #include <TNL/Assert.h> -#include <TNL/Devices/Cuda.h> +#include <TNL/Cuda/CheckDevice.h> #include <TNL/Exceptions/CudaRuntimeError.h> #ifdef HAVE_GTEST diff --git a/src/UnitTests/Containers/ArrayTest.h b/src/UnitTests/Containers/ArrayTest.h index 25c7fda4935dbfcd5d064d22bcce86adfddc4b2e..69dd2d25227202a4391aebb0054dae8acf8dfa26 100644 --- a/src/UnitTests/Containers/ArrayTest.h +++ b/src/UnitTests/Containers/ArrayTest.h @@ -15,6 +15,7 @@ #include <TNL/Containers/Array.h> #include <TNL/Containers/Vector.h> +#include <TNL/Pointers/DevicePointer.h> #include "gtest/gtest.h" @@ -312,9 +313,9 @@ void testArrayElementwiseAccess( Array< Value, Devices::Cuda, Index >&& u ) #ifdef HAVE_CUDA u.setSize( 10 ); using ArrayType = Array< Value, Devices::Cuda, Index >; - ArrayType* kernel_u = Devices::Cuda::passToDevice( u ); - testSetGetElementKernel<<< 1, 16 >>>( kernel_u ); - Devices::Cuda::freeFromDevice( kernel_u ); + Pointers::DevicePointer< ArrayType > kernel_u( u ); + testSetGetElementKernel<<< 1, 16 >>>( &kernel_u.template modifyData< Devices::Cuda >() ); + cudaDeviceSynchronize(); TNL_CHECK_CUDA_DEVICE; for( int i = 0; i < 10; i++ ) { EXPECT_EQ( u.getElement( i ), i );