Implemented helper functions for setup of CUDA grids. (d359d3d1) · Commits · TNL / tnl-dev

src/TNL/Devices/Cuda.cu

+101 −0

Original line number	Diff line number	Diff line
		@@ -15,6 +15,101 @@
		namespace TNL {
		namespace Devices {


		void Cuda::setupThreads( const dim3& blockSize,
		dim3& blocksCount,
		dim3& gridsCount,
		long long int xThreads,
		long long int yThreads,
		long long int zThreads )
		{
		if( blockSize.x )
		blocksCount.x = xThreads / blockSize.x + ( xThreads % blockSize.x != 0 );
		else
		blocksCount.x = 0;
		if( blockSize.y )
		blocksCount.y = yThreads / blockSize.y + ( yThreads % blockSize.y != 0 );
		else
		blocksCount.y = 0;
		if( blockSize.z )
		blocksCount.z = xThreads / blockSize.z + ( zThreads % blockSize.z != 0 );
		else
		blocksCount.z = 0;

		/****
		* TODO: Fix the following:
		* I do not known how to get max grid size in kernels :(
		*
		int currentDevice( 0 );
		cudaGetDevice( currentDevice );
		cudaDeviceProp properties;
		cudaGetDeviceProperties( &properties, currentDevice );
		gridsCount.x = blocksCount.x / properties.maxGridSize[ 0 ] + ( blocksCount.x % properties.maxGridSize[ 0 ] != 0 );
		gridsCount.y = blocksCount.y / properties.maxGridSize[ 1 ] + ( blocksCount.y % properties.maxGridSize[ 1 ] != 0 );
		gridsCount.z = blocksCount.z / properties.maxGridSize[ 2 ] + ( blocksCount.z % properties.maxGridSize[ 2 ] != 0 );
		*/
		gridsCount.x = blocksCount.x / getMaxGridSize() + ( blocksCount.x % getMaxGridSize() != 0 );
		gridsCount.y = blocksCount.y / getMaxGridSize() + ( blocksCount.y % getMaxGridSize() != 0 );
		gridsCount.z = blocksCount.z / getMaxGridSize() + ( blocksCount.z % getMaxGridSize() != 0 );
		}

		void Cuda::setupGrid( const dim3& blocksCount,
		const dim3& gridsCount,
		const dim3& gridIdx,
		dim3& gridSize )
		{
		int currentDevice( 0 );
		cudaGetDevice( &currentDevice );
		cudaDeviceProp properties;
		cudaGetDeviceProperties( &properties, currentDevice );

		/****
		* TODO: fix the following
		if( gridIdx.x < gridsCount.x )
		gridSize.x = properties.maxGridSize[ 0 ];
		else
		gridSize.x = blocksCount.x % properties.maxGridSize[ 0 ];

		if( gridIdx.y < gridsCount.y )
		gridSize.y = properties.maxGridSize[ 1 ];
		else
		gridSize.y = blocksCount.y % properties.maxGridSize[ 1 ];

		if( gridIdx.z < gridsCount.z )
		gridSize.z = properties.maxGridSize[ 2 ];
		else
		gridSize.z = blocksCount.z % properties.maxGridSize[ 2 ];*/

		if( gridIdx.x < gridsCount.x )
		gridSize.x = getMaxGridSize();
		else
		gridSize.x = blocksCount.x % getMaxGridSize();

		if( gridIdx.y < gridsCount.y )
		gridSize.y = getMaxGridSize();
		else
		gridSize.y = blocksCount.y % getMaxGridSize();

		if( gridIdx.z < gridsCount.z )
		gridSize.z = getMaxGridSize();
		else
		gridSize.z = blocksCount.z % getMaxGridSize();

		}

		void Cuda::printThreadsSetup( const dim3& blockSize,
		const dim3& blocksCount,
		const dim3& gridSize,
		const dim3& gridsCount,
		std::ostream& str )
		{
		/*str << "Block size: " << blockSize << std::endl
		<< " Blocks count: " << blocksCount << std::endl
		<< " Grid size: " << gridSize << std::endl
		<< " Grids count: " << gridsCount << std::endl;*/
		}


		bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
		{
		if( error == cudaSuccess )
		@@ -397,5 +492,11 @@ bool Cuda::checkDevice( const char* file_name, int line, cudaError error )
		return false;
		}

		std::ostream& operator << ( std::ostream& str, const dim3& d )
		{
		str << "( " << d.x << ", " << d.y << ", " << d.z << " )";
		return str;
		}

		} // namespace Devices
		} // namespace TNL

src/TNL/Devices/Cuda.h

+52 −1

Original line number	Diff line number	Diff line
		@@ -49,17 +49,67 @@ class Cuda
		static inline constexpr int getGPUTransferBufferSize();

		#ifdef HAVE_CUDA
		/***
		* This function is obsolete and should be replaced by the following functions.
		*/
		__device__ static inline int
		getGlobalThreadIdx( const int gridIdx = 0,
		const int gridSize = getMaxGridSize() );

		__device__ static inline int
		getGlobalThreadIdx_x( const dim3& gridIdx );

		__device__ static inline int
		getGlobalThreadIdx_y( const dim3& gridIdx );

		__device__ static inline int
		getGlobalThreadIdx_z( const dim3& gridIdx );
		#endif

		/****
		* This functions helps to count number of CUDA blocks depending on the
		* number of the CUDA threads and the block size.
		* It is obsolete and it will be replaced by setupThreads.
		*/
		static int getNumberOfBlocks( const int threads,
		const int blockSize );

		/****
		* This functions helps to count number of CUDA grids depending on the
		* number of the CUDA blocks and maximum grid size.
		* It is obsolete and it will be replaced by setupThreads.
		*/
		static int getNumberOfGrids( const int blocks,
		const int gridSize = getMaxGridSize() );

		#ifdef HAVE_CUDA
		/*! This method sets up gridSize and computes number of grids depending
		* on total number of CUDA threads.
		*/
		static void setupThreads( const dim3& blockSize,
		dim3& blocksCount,
		dim3& gridsCount,
		long long int xThreads,
		long long int yThreads = 0,
		long long int zThreads = 0 );

		/*! This method sets up grid size when one iterates over more grids.
		* If gridIdx.? < gridsCount.? then the gridSize.? is set to maximum
		* allowed by CUDA. Otherwise gridSize.? is set to the size of the grid
		* in the last loop i.e. blocksCount.? % maxGridSize.?.
		*/
		static void setupGrid( const dim3& blocksCount,
		const dim3& gridsCount,
		const dim3& gridIdx,
		dim3& gridSize );

		static void printThreadsSetup( const dim3& blockSize,
		const dim3& blocksCount,
		const dim3& gridSize,
		const dim3& gridsCount,
		std::ostream& str = std::cout );
		#endif

		template< typename ObjectType >
		static ObjectType* passToDevice( const ObjectType& object );

		@@ -135,6 +185,7 @@ class Cuda

		#ifdef HAVE_CUDA
		#define checkCudaDevice ::TNL::Devices::Cuda::checkDevice( __FILE__, __LINE__, cudaGetLastError() )
		std::ostream& operator << ( std::ostream& str, const dim3& d );
		#else
		#define checkCudaDevice ::TNL::Devices::Cuda::checkDevice()
		#endif

src/TNL/Devices/Cuda_impl.h

+16 −0

Original line number	Diff line number	Diff line
		@@ -53,6 +53,22 @@ __device__ inline int Cuda::getGlobalThreadIdx( const int gridIdx, const int gri
		{
		return ( gridIdx * gridSize + blockIdx.x ) * blockDim.x + threadIdx.x;
		}

		__device__ inline int Cuda::getGlobalThreadIdx_x( const dim3& gridIdx )
		{
		return ( gridIdx.x * getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		}

		__device__ inline int Cuda::getGlobalThreadIdx_y( const dim3& gridIdx )
		{
		return ( gridIdx.y * getMaxGridSize() + blockIdx.y ) * blockDim.y + threadIdx.y;
		}

		__device__ inline int Cuda::getGlobalThreadIdx_z( const dim3& gridIdx )
		{
		return ( gridIdx.z * getMaxGridSize() + blockIdx.z ) * blockDim.z + threadIdx.z;
		}

		#endif

src/TNL/Meshes/GridDetails/GridTraverser_impl.h

+20 −12

Original line number	Diff line number	Diff line
		@@ -357,13 +357,13 @@ GridTraverser2DBoundaryAlongX(
		const Index beginX,
		const Index endX,
		const Index fixedY,
		const Index gridIdx,
		const dim3 gridIdx,
		const GridEntityParameters... gridEntityParameters )
		{
		typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
		typename GridType::CoordinatesType coordinates;

		coordinates.x() = beginX + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		coordinates.x() = beginX + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );
		coordinates.y() = fixedY;

		if( coordinates.x() <= endX )
		@@ -391,14 +391,14 @@ GridTraverser2DBoundaryAlongY(
		const Index beginY,
		const Index endY,
		const Index fixedX,
		const Index gridIdx,
		const dim3 gridIdx,
		const GridEntityParameters... gridEntityParameters )
		{
		typedef Meshes::Grid< 2, Real, Devices::Cuda, Index > GridType;
		typename GridType::CoordinatesType coordinates;

		coordinates.x() = fixedX;
		coordinates.y() = beginY + ( gridIdx * Devices::Cuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		coordinates.y() = beginY + Devices::Cuda::getGlobalThreadIdx_x( gridIdx );

		if( coordinates.y() <= endY )
		{
		@@ -438,23 +438,30 @@ processEntities(
		( GridEntity::getDimensions() == 2 \|\| GridEntity::getDimensions() == 0 ) )
		{
		dim3 cudaBlockSize( 256 );
		const IndexType entitiesAlongX = end.x() - begin.x() + 1;
		dim3 cudaBlocksCountAlongX, cudaGridsCountAlongX,
		cudaBlocksCountAlongY, cudaGridsCountAlongY;
		Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongX, cudaGridsCountAlongX, end.x() - begin.x() + 1 );
		Devices::Cuda::setupThreads( cudaBlockSize, cudaBlocksCountAlongY, cudaGridsCountAlongY, end.y() - begin.y() - 1 );

		/*const IndexType entitiesAlongX = end.x() - begin.x() + 1;
		const IndexType entitiesAlongY = end.y() - begin.y() - 1;
		dim3 cudaBlocksAlongX, cudaBlocksAlongY;
		cudaBlocksAlongX.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongX, cudaBlockSize.x );
		cudaBlocksAlongY.x = Devices::Cuda::getNumberOfBlocks( entitiesAlongY, cudaBlockSize.x );
		const IndexType cudaGridsAlongX = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongX.x );
		const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x );
		const IndexType cudaGridsAlongY = Devices::Cuda::getNumberOfGrids( cudaBlocksAlongY.x );*/

		auto& pool = CudaStreamPool::getInstance();
		Devices::Cuda::synchronizeDevice();

		const cudaStream_t& s1 = pool.getStream( stream );
		const cudaStream_t& s2 = pool.getStream( stream + 1 );
		for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ )
		dim3 gridIdx, cudaGridSize;
		for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongX.x; gridIdx.x++ )
		{
		Devices::Cuda::setupGrid( cudaBlocksCountAlongX, cudaGridsCountAlongX, gridIdx, cudaGridSize );
		GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
		<<< cudaBlocksAlongX, cudaBlockSize, 0, s1 >>>
		<<< cudaGridSize, cudaBlockSize, 0, s1 >>>
		( &gridPointer.template getData< Devices::Cuda >(),
		&userDataPointer.template modifyData< Devices::Cuda >(),
		begin.x(),
		@@ -463,7 +470,7 @@ processEntities(
		gridIdx,
		gridEntityParameters... );
		GridTraverser2DBoundaryAlongX< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
		<<< cudaBlocksAlongX, cudaBlockSize, 0, s2 >>>
		<<< cudaGridSize, cudaBlockSize, 0, s2 >>>
		( &gridPointer.template getData< Devices::Cuda >(),
		&userDataPointer.template modifyData< Devices::Cuda >(),
		begin.x(),
		@@ -474,10 +481,11 @@ processEntities(
		}
		const cudaStream_t& s3 = pool.getStream( stream + 2 );
		const cudaStream_t& s4 = pool.getStream( stream + 3 );
		for( IndexType gridIdx = 0; gridIdx < cudaGridsAlongX; gridIdx++ )
		for( gridIdx.x = 0; gridIdx.x < cudaGridsCountAlongY.x; gridIdx.x++ )
		{
		Devices::Cuda::setupGrid( cudaBlocksCountAlongY, cudaGridsCountAlongY, gridIdx, cudaGridSize );
		GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
		<<< cudaBlocksAlongY, cudaBlockSize, 0, s3 >>>
		<<< cudaGridSize, cudaBlockSize, 0, s3 >>>
		( &gridPointer.template getData< Devices::Cuda >(),
		&userDataPointer.template modifyData< Devices::Cuda >(),
		begin.y() + 1,
		@@ -486,7 +494,7 @@ processEntities(
		gridIdx,
		gridEntityParameters... );
		GridTraverser2DBoundaryAlongY< Real, Index, GridEntity, UserData, EntitiesProcessor, processOnlyBoundaryEntities, GridEntityParameters... >
		<<< cudaBlocksAlongY, cudaBlockSize, 0, s4 >>>
		<<< cudaGridSize, cudaBlockSize, 0, s4 >>>
		( &gridPointer.template getData< Devices::Cuda >(),
		&userDataPointer.template modifyData< Devices::Cuda >(),
		begin.y() + 1,