Commit 2d5176fb authored by Jakub Klinkovský's avatar Jakub Klinkovský
Browse files

Moved (most of) static methods from TNL::Devices::Cuda as free functions into separate namespace TNL::Cuda

Moved (most of) static methods from TNL::Devices::Cuda as free functions into separate namespace TNL::Cuda

The class TNL::Devices::Cuda was too bloated, breaking the Single
Responsibility Principle. It should be used only for template
specializations and other things common to all devices.

The functions in MemoryHelpers.h are deprecated, smart pointers should
be used instead.

The functions in LaunchHelpers.h are temporary, more refactoring is
needed with respect to execution policies and custom launch parameters.
parent fed5d45c
Loading
Loading
Loading
Loading
+4 −4
Original line number Diff line number Diff line
@@ -53,7 +53,7 @@ __global__ void setCudaTestMatrixKernel( Matrix* matrix,
                                         const int elementsPerRow,
                                         const int gridIdx )
{
   const int rowIdx = ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   const int rowIdx = ( gridIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   if( rowIdx >= matrix->getRows() )
      return;
   int col = rowIdx - elementsPerRow / 2;
@@ -73,12 +73,12 @@ void setCudaTestMatrix( Matrix& matrix,
   typedef typename Matrix::IndexType IndexType;
   typedef typename Matrix::RealType RealType;
   Pointers::DevicePointer< Matrix > kernel_matrix( matrix );
   dim3 cudaBlockSize( 256 ), cudaGridSize( Devices::Cuda::getMaxGridSize() );
   dim3 cudaBlockSize( 256 ), cudaGridSize( Cuda::getMaxGridSize() );
   const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Devices::Cuda::getMaxGridSize() );
   const IndexType cudaGrids = roundUpDivision( cudaBlocks, Cuda::getMaxGridSize() );
   for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ ) {
      if( gridIdx == cudaGrids - 1 )
         cudaGridSize.x = cudaBlocks % Devices::Cuda::getMaxGridSize();
         cudaGridSize.x = cudaBlocks % Cuda::getMaxGridSize();
      setCudaTestMatrixKernel< Matrix >
         <<< cudaGridSize, cudaBlockSize >>>
         ( &kernel_matrix.template modifyData< Devices::Cuda >(), elementsPerRow, gridIdx );
+10 −10
Original line number Diff line number Diff line
@@ -24,7 +24,7 @@

#include <TNL/Devices/Host.h>
#include <TNL/Devices/SystemInfo.h>
#include <TNL/Devices/CudaDeviceInfo.h>
#include <TNL/Cuda/DeviceInfo.h>
#include <TNL/Config/ConfigDescription.h>
#include <TNL/Communicators/MpiCommunicator.h>

@@ -339,9 +339,9 @@ Benchmark::MetadataMap getHardwareMetadata()
                       + convertToString( cacheSizes.L2 ) + ", "
                       + convertToString( cacheSizes.L3 );
#ifdef HAVE_CUDA
   const int activeGPU = Devices::CudaDeviceInfo::getActiveDevice();
   const String deviceArch = convertToString( Devices::CudaDeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
                             convertToString( Devices::CudaDeviceInfo::getArchitectureMinor( activeGPU ) );
   const int activeGPU = Cuda::DeviceInfo::getActiveDevice();
   const String deviceArch = convertToString( Cuda::DeviceInfo::getArchitectureMajor( activeGPU ) ) + "." +
                             convertToString( Cuda::DeviceInfo::getArchitectureMinor( activeGPU ) );
#endif
   Benchmark::MetadataMap metadata {
       { "host name", Devices::SystemInfo::getHostname() },
@@ -362,13 +362,13 @@ Benchmark::MetadataMap getHardwareMetadata()
       { "CPU max frequency (MHz)", convertToString( Devices::SystemInfo::getCPUMaxFrequency( cpu_id ) / 1e3 ) },
       { "CPU cache sizes (L1d, L1i, L2, L3) (kiB)", cacheInfo },
#ifdef HAVE_CUDA
       { "GPU name", Devices::CudaDeviceInfo::getDeviceName( activeGPU ) },
       { "GPU name", Cuda::DeviceInfo::getDeviceName( activeGPU ) },
       { "GPU architecture", deviceArch },
       { "GPU CUDA cores", convertToString( Devices::CudaDeviceInfo::getCudaCores( activeGPU ) ) },
       { "GPU clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
       { "GPU global memory (GB)", convertToString( (double) Devices::CudaDeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
       { "GPU memory clock rate (MHz)", convertToString( (double) Devices::CudaDeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
       { "GPU memory ECC enabled", convertToString( Devices::CudaDeviceInfo::getECCEnabled( activeGPU ) ) },
       { "GPU CUDA cores", convertToString( Cuda::DeviceInfo::getCudaCores( activeGPU ) ) },
       { "GPU clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getClockRate( activeGPU ) / 1e3 ) },
       { "GPU global memory (GB)", convertToString( (double) Cuda::DeviceInfo::getGlobalMemory( activeGPU ) / 1e9 ) },
       { "GPU memory clock rate (MHz)", convertToString( (double) Cuda::DeviceInfo::getMemoryClockRate( activeGPU ) / 1e3 ) },
       { "GPU memory ECC enabled", convertToString( Cuda::DeviceInfo::getECCEnabled( activeGPU ) ) },
#endif
   };

+15 −15
Original line number Diff line number Diff line
@@ -82,9 +82,9 @@ setup( const Config::ParameterContainer& parameters,

   if( std::is_same< DeviceType, Devices::Cuda >::value )
   {
      this->cudaBoundaryConditions = Devices::Cuda::passToDevice( *this->boundaryConditionPointer );
      this->cudaRightHandSide = Devices::Cuda::passToDevice( *this->rightHandSidePointer );
      this->cudaDifferentialOperator = Devices::Cuda::passToDevice( *this->differentialOperatorPointer );
      this->cudaBoundaryConditions = Cuda::passToDevice( *this->boundaryConditionPointer );
      this->cudaRightHandSide = Cuda::passToDevice( *this->rightHandSidePointer );
      this->cudaDifferentialOperator = Cuda::passToDevice( *this->differentialOperatorPointer );
   }
   this->explicitUpdater.setDifferentialOperator( this->differentialOperatorPointer );
   this->explicitUpdater.setBoundaryConditions( this->boundaryConditionPointer );
@@ -266,8 +266,8 @@ boundaryConditionsTemplatedCompact( const GridType* grid,
{
   typename GridType::CoordinatesType coordinates;

   coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;        
   coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;        

   if( coordinates.x() < end.x() &&
       coordinates.y() < end.y() )
@@ -357,8 +357,8 @@ heatEquationTemplatedCompact( const GridType* grid,
   typedef typename GridType::IndexType IndexType;
   typedef typename GridType::RealType RealType;

   coordinates.x() = begin.x() + ( gridXIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Devices::Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;     
   coordinates.x() = begin.x() + ( gridXIdx * Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
   coordinates.y() = begin.y() + ( gridYIdx * Cuda::getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;     
      
   MeshFunction& u = *_u;
   MeshFunction& fu = *_fu;
@@ -483,10 +483,10 @@ getExplicitUpdate( const RealType& time,
         CellType cell( mesh.template getData< DeviceType >() );
         dim3 cudaBlockSize( 16, 16 );
         dim3 cudaBlocks;
         cudaBlocks.x = Devices::Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
         cudaBlocks.y = Devices::Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
         const IndexType cudaXGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.x );
         const IndexType cudaYGrids = Devices::Cuda::getNumberOfGrids( cudaBlocks.y );
         cudaBlocks.x = Cuda::getNumberOfBlocks( end.x() - begin.x() + 1, cudaBlockSize.x );
         cudaBlocks.y = Cuda::getNumberOfBlocks( end.y() - begin.y() + 1, cudaBlockSize.y );
         const IndexType cudaXGrids = Cuda::getNumberOfGrids( cudaBlocks.x );
         const IndexType cudaYGrids = Cuda::getNumberOfGrids( cudaBlocks.y );
         
         //std::cerr << "Setting boundary conditions..." << std::endl;

@@ -762,10 +762,10 @@ template< typename Mesh,
HeatEquationBenchmarkProblem< Mesh, BoundaryCondition, RightHandSide, DifferentialOperator, Communicator >::
~HeatEquationBenchmarkProblem()
{
   if( this->cudaMesh ) Devices::Cuda::freeFromDevice( this->cudaMesh );
   if( this->cudaBoundaryConditions )  Devices::Cuda::freeFromDevice( this->cudaBoundaryConditions );
   if( this->cudaRightHandSide ) Devices::Cuda::freeFromDevice( this->cudaRightHandSide );
   if( this->cudaDifferentialOperator ) Devices::Cuda::freeFromDevice( this->cudaDifferentialOperator );
   if( this->cudaMesh ) Cuda::freeFromDevice( this->cudaMesh );
   if( this->cudaBoundaryConditions )  Cuda::freeFromDevice( this->cudaBoundaryConditions );
   if( this->cudaRightHandSide ) Cuda::freeFromDevice( this->cudaRightHandSide );
   if( this->cudaDifferentialOperator ) Cuda::freeFromDevice( this->cudaDifferentialOperator );
}


+1 −1
Original line number Diff line number Diff line
@@ -12,7 +12,7 @@

#include <TNL/Meshes/Grid.h>
#include <TNL/Pointers/SharedPointer.h>
#include <TNL/CudaStreamPool.h>
#include <TNL/Cuda/StreamPool.h>

namespace TNL {

+11 −11
Original line number Diff line number Diff line
@@ -126,8 +126,8 @@ _GridTraverser2D(
   typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
   typename GridType::CoordinatesType coordinates;

   coordinates.x() = begin.x() + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   coordinates.y() = begin.y() + Devices::Cuda::getGlobalThreadIdx_y( gridIdx );
   coordinates.x() = begin.x() + Cuda::getGlobalThreadIdx_x( gridIdx );
   coordinates.y() = begin.y() + Cuda::getGlobalThreadIdx_y( gridIdx );
   
   if( coordinates <= end )
   {
@@ -173,7 +173,7 @@ _GridTraverser2DBoundary(
   Index entitiesAlongX = endX - beginX + 1;
   Index entitiesAlongY = endY - beginY;
   
   Index threadId = Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
   Index threadId = Cuda::getGlobalThreadIdx_x( gridIdx );
   if( threadId < entitiesAlongX )
   {
      GridEntity entity( *grid, 
@@ -244,12 +244,12 @@ processEntities(
      dim3 cudaBlockSize( 256 );      
      dim3 cudaBlocksCount, cudaGridsCount;
      IndexType cudaThreadsCount = 2 * ( end.x() - begin.x() + end.y() - begin.y() + 1 );
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount, cudaThreadsCount );
      dim3 gridIdx, cudaGridSize;
      Devices::Cuda::synchronizeDevice();
      for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x++ )
      {
         Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
         Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
         _GridTraverser2DBoundary< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
@@ -266,11 +266,11 @@ processEntities(
   {
      dim3 cudaBlockSize( 16, 16 );
      dim3 cudaBlocksCount, cudaGridsCount;
      Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
      Cuda::setupThreads( cudaBlockSize, cudaBlocksCount, cudaGridsCount,
                          end.x() - begin.x() + 1,
                          end.y() - begin.y() + 1 );
      
      auto& pool = CudaStreamPool::getInstance();
      auto& pool = Cuda::StreamPool::getInstance();
      const cudaStream_t& s = pool.getStream( stream );

      Devices::Cuda::synchronizeDevice();
@@ -278,8 +278,8 @@ processEntities(
      for( gridIdx.y = 0; gridIdx.y < cudaGridsCount.y; gridIdx.y ++ )
         for( gridIdx.x = 0; gridIdx.x < cudaGridsCount.x; gridIdx.x ++ )
         {
            Devices::Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
	    //Devices::Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
            Cuda::setupGrid( cudaBlocksCount, cudaGridsCount, gridIdx, cudaGridSize );
	    //Cuda::printThreadsSetup( cudaBlockSize, cudaBlocksCount, cudaGridSize, cudaGridsCount );
            TNL::_GridTraverser2D< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
               <<< cudaGridSize, cudaBlockSize, 0, s >>>
               ( &gridPointer.template getData< Devices::Cuda >(),
Loading