Implementing the CUDA support for the dense matrix format. (d5a890c8) · Commits · TNL / tnl-dev

src/core/tnlCuda.h

+9 −11

Original line number	Diff line number	Diff line
		@@ -33,15 +33,17 @@ class tnlCuda
		#ifdef HAVE_CUDA
		__host__ __device__
		#endif
		static inline tnlDeviceEnum getDevice() { return tnlCudaDevice; };
		static inline tnlDeviceEnum getDevice();

		static int getMaxGridSize();

		static void setMaxGridSize( int newMaxGridSize );

		static int getMaxBlockSize();
		#ifdef HAVE_CUDA
		__host__ __device__
		#endif
		static inline int getMaxGridSize();

		static void setMaxBlockSize( int newMaxBlockSize );
		#ifdef HAVE_CUDA
		__host__ __device__
		#endif
		static inline int getMaxBlockSize();

		static int getGPUTransferBufferSize();

		@@ -64,10 +66,6 @@ class tnlCuda


		static bool checkDevice( const char* file_name, int line );

		protected:

		static int maxGridSize, maxBlockSize;
		};

		#define checkCudaDevice tnlCuda::checkDevice( __FILE__, __LINE__ )

src/implementation/core/arrays/tnlSharedArray_impl.h

+1 −0

Original line number	Diff line number	Diff line
		@@ -376,6 +376,7 @@ ostream& operator << ( ostream& str, const tnlSharedArray< Element, Device, Inde
		str << ", " << v. getElement( i );
		}
		str << " ]";
		return str;
		}

		//}; // namespace implementation

src/implementation/core/tnlCuda.cpp

+0 −24

Original line number	Diff line number	Diff line
		@@ -18,35 +18,11 @@
		#include <core/tnlCuda.h>
		#include <tnlConfig.h>

		int tnlCuda :: maxGridSize = maxCudaGridSize;

		int tnlCuda :: maxBlockSize = maxCudaBlockSize;

		tnlString tnlCuda :: getDeviceType()
		{
		return tnlString( "tnlCuda" );
		}

		int tnlCuda :: getMaxGridSize()
		{
		return maxGridSize;
		}

		void tnlCuda :: setMaxGridSize( int newMaxGridSize )
		{
		maxGridSize = newMaxGridSize;
		}

		int tnlCuda :: getMaxBlockSize()
		{
		return maxBlockSize;
		}

		void tnlCuda :: setMaxBlockSize( int newMaxBlockSize )
		{
		maxBlockSize = newMaxBlockSize;
		}

		int tnlCuda::getGPUTransferBufferSize()
		{
		return 1 << 20;

src/implementation/core/tnlCuda_impl.h

+27 −0

Original line number	Diff line number	Diff line
		@@ -20,6 +20,33 @@

		#ifdef HAVE_CUDA

		#ifdef HAVE_CUDA
		__host__ __device__
		#endif
		inline tnlDeviceEnum tnlCuda::getDevice()
		{
		return tnlCudaDevice;
		};

		#ifdef HAVE_CUDA
		__host__ __device__
		#endif
		inline int tnlCuda::getMaxGridSize()
		{
		// TODO: make it preprocessor macro constant defined in tnlConfig
		return 65536;
		};

		#ifdef HAVE_CUDA
		__host__ __device__
		#endif
		inline int tnlCuda::getMaxBlockSize()
		{
		// TODO: make it preprocessor macro constant defined in tnlConfig
		return 1024;
		};


		template< typename ObjectType >
		ObjectType* tnlCuda::passToDevice( const ObjectType& object )
		{

src/implementation/matrices/tnlDenseMatrix_impl.h

+138 −32

Original line number	Diff line number	Diff line
		@@ -117,6 +117,15 @@ void tnlDenseMatrix< Real, Device, Index >::reset()
		this->values.reset();
		}

		template< typename Real,
		typename Device,
		typename Index >
		void tnlDenseMatrix< Real, Device, Index >::setValue( const Real& value )
		{
		this->values.setValue( value );
		}


		template< typename Real,
		typename Device,
		typename Index >
		@@ -160,6 +169,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElementFast( const IndexType row,
		else
		this->values.operator[]( elementIndex ) =
		thisElementMultiplicator * this->values.operator[]( elementIndex ) + value;
		return true;
		}

		template< typename Real,
		@@ -177,6 +187,7 @@ bool tnlDenseMatrix< Real, Device, Index >::addElement( const IndexType row,
		else
		this->values.setElement( elementIndex,
		thisElementMultiplicator * this->values.getElement( elementIndex ) + value );
		return true;
		}


		@@ -316,15 +327,33 @@ template< typename Real,
		typename Device,
		typename Index >
		template< typename Vector >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		typename Vector::RealType tnlDenseMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
		const Vector& vector ) const
		{
		RealType sum( 0.0 );
		for( IndexType column = 0; column < this->getColumns(); column++ )
		sum += this->getElement( row, column ) * vector.getElement( column );
		sum += this->getElementFast( row, column ) * vector[ column ];
		return sum;
		}

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		typename Vector >
		__global__ void tnlDenseMatrixVectorProductCudaKernel( tnlDenseMatrix< Real, tnlCuda, Index >* matrix,
		const Vector* inVector,
		Vector* outVector,
		const Index gridIdx )
		{
		const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		if( rowIdx < matrix->getRows() )
		( outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, inVector );
		}
		#endif

		template< typename Real,
		typename Device,
		typename Index >
		@@ -343,8 +372,31 @@ void tnlDenseMatrix< Real, Device, Index >::vectorProduct( const Vector& inVecto
		<< "Vector size: " << outVector.getSize() << endl
		<< "Vector name: " << outVector.getName() << endl );

		if( Device::getDevice() == tnlHostDevice )
		for( IndexType row = 0; row < this->getRows(); row++ )
		outVector[ row ] = rowVectorProduct( row, inVector );
		if( Device::getDevice() == tnlCudaDevice )
		{
		#ifdef HAVE_CUDA
		ThisType* kernel_this = tnlCuda::passToDevice( *this );
		Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
		Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlDenseMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
		( kernel_this, kernel_inVector, kernel_outVector, gridIdx );
		}
		tnlCuda::freeFromDevice( kernel_this );
		tnlCuda::freeFromDevice( kernel_inVector );
		tnlCuda::freeFromDevice( kernel_outVector );
		checkCudaDevice;
		#endif
		}
		}

		template< typename Real,
		@@ -365,20 +417,29 @@ void tnlDenseMatrix< Real, Device, Index >::addMatrix( const Matrix& matrix,
		<< "That matrix name: " << matrix.getName() << endl );

		if( thisMatrixMultiplicator == 1.0 )
		{
		for( IndexType row = 0; row < this->getRows(); row++ )
		for( IndexType column = 0; column < this->getColumns(); column++ )
		this->operator()( row, column ) += matrixMultiplicator*matrix( row, column );
		}
		this->values.alphaXPlusY( matrixMultiplicator, matrix.values );
		else
		{
		for( IndexType row = 0; row < this->getRows(); row++ )
		for( IndexType column = 0; column < this->getColumns(); column++ )
		this->operator()( row, column ) =
		thisMatrixMultiplicator * this->operator()( row, column) +
		matrixMultiplicator * matrix( row, column );
		this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator );
		}

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		typename Matrix1,
		typename Matrix2,
		int tileDim,
		int tileRowBlockSize >
		__global__ void tnlDenseMatrixMatrixProductKernel( tnlDenseMatrix< Real, tnlCuda, Index >* reusltMatrix,
		const Matrix1* matrix1,
		const Matrix2* matrix2,
		const Real matrix1Multiplicator,
		const Real matrix2Multiplicator,
		const Index gridIdx_x,
		const Index gridIdx_y )
		{

		}
		#endif

		template< typename Real,
		typename Device,
		@@ -402,6 +463,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat
		<< "Matrix2 rows: " << matrix2.getRows() << endl
		<< "Matrix2 name: " << matrix2.getName() << endl );

		if( Device::getDevice() == tnlHostDevice )
		for( IndexType i = 0; i < this->getRows(); i += tileDim )
		for( IndexType j = 0; j < this->getColumns(); j += tileDim )
		{
		@@ -409,7 +471,7 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat
		const IndexType tileColumns = Min( tileDim, this->getColumns() - j );
		for( IndexType i1 = i; i1 < i + tileRows; i1++ )
		for( IndexType j1 = j; j1 < j + tileColumns; j1++ )
		this->operator()( i1, j1 ) = 0.0;
		this->setElementFast( i1, j1, 0.0 );

		for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim )
		{
		@@ -417,9 +479,53 @@ void tnlDenseMatrix< Real, Device, Index >::getMatrixProduct( const Matrix1& mat
		for( IndexType i1 = 0; i1 < tileRows; i1++ )
		for( IndexType j1 = 0; j1 < tileColumns; j1++ )
		for( IndexType k1 = k; k1 < lastK; k1++ )
		this->operator()( i + i1, j + j1 ) +=
		matrix1( i + i1, k1 ) * matrix2( k1, j + j1 );
		this->addElementFast( i + i1, j + j1,
		matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) );
		}
		}
		if( Device::getDevice() == tnlCudaDevice )
		{
		#ifdef HAVE_CUDA
		dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
		const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim );
		const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim );
		cudaBlockSize.x = blockXSize;
		cudaBlockSize.y = blockYSize;
		const IndexType rowGrids = roundUpDivision( rowTiles, tnlCuda::getMaxGridSize() );
		const IndexType columnGrids = roundUpDivision( columnTiles, tnlCuda::getMaxGridSize() );

		for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
		for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ )
		{
		cudaGridSize.x = cudaGridSize.y = tnlCuda::getMaxGridSize();
		if( gridIdx_x == columnGrids - 1 )
		cudaGridSize.x = columnTiles % tnlCuda::getMaxGridSize();
		if( gridIdx_y == rowGrids - 1 )
		cudaGridSize.y = rowTiles % tnlCuda::getMaxGridSize();
		ThisType* this_kernel = tnlCuda::passToDevice( *this );
		Matrix1* matrix1_kernel = tnlCuda::passToDevice( matrix1 );
		Matrix2* matrix2_kernel = tnlCuda::passToDevice( matrix2 );
		tnlDenseMatrixMatrixProductKernel< Real,
		Index,
		Matrix1,
		Matrix2,
		tileDim,
		tileRowBlockSize >
		<<< cudaGridSize,
		cudaBlockSize,
		3tileDimtileDim >>>
		( this_kernel,
		matrix1_kernel,
		matrix2_kernel,
		matrix1Multiplicator,
		matrix2Multiplicator,
		gridIdx_x,
		gridIdx_y );
		tnlCuda::freeFromDevice( this_kernel );
		tnlCuda::freeFromDevice( matrix1_kernel );
		tnlCuda::freeFromDevice( matrix2_kernel );
		}
		#endif
		}
		}