Refactoring dense matrix-matrix multiplication. (d4ac2a56) · Commits · TNL / tnl-dev

src/TNL/File.hpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -83,7 +83,7 @@ template< typename Type,
		typename Allocator >
		void File::load( Type* buffer, std::streamsize elements )
		{
		static_assert( std::is_same< Type, typename Allocator::value_type >::value,
		static_assert( std::is_same< std::remove_cv_t< Type >, std::remove_cv_t< typename Allocator::value_type > >::value,
		"Allocator::value_type must be the same as Type." );
		TNL_ASSERT_GE( elements, 0, "Number of elements to load must be non-negative." );

src/TNL/Matrices/DenseMatrix.hpp

+7 −178

Original line number	Diff line number	Diff line
		@@ -460,107 +460,6 @@ addMatrix( const Matrix& matrix,
		this->values = thisMatrixMultiplicator * this->values + matrixMultiplicator * matrix.values;
		}

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Index,
		ElementsOrganization Organization,
		typename RealAllocator,
		typename Matrix1,
		typename Matrix2,
		int tileDim,
		int tileRowBlockSize >
		__global__ void DenseMatrixProductKernel( DenseMatrix< Real, Devices::Cuda, Index >* resultMatrix,
		const Matrix1* matrixA,
		const Matrix2* matrixB,
		const Real matrixAMultiplicator,
		const Real matrixBMultiplicator,
		const Index gridIdx_x,
		const Index gridIdx_y )
		{
		/****
		* Here we compute product C = A * B. To profit from the fast
		* shared memory we do it by tiles.
		*/

		typedef Index IndexType;
		typedef Real RealType;
		__shared__ Real tileA[ tileDim*tileDim ];
		__shared__ Real tileB[ tileDim*tileDim ];
		__shared__ Real tileC[ tileDim*tileDim ];

		const IndexType& matrixARows = matrixA->getRows();
		const IndexType& matrixAColumns = matrixA->getColumns();
		const IndexType& matrixBRows = matrixB->getRows();
		const IndexType& matrixBColumns = matrixB->getColumns();

		/****
		* Reset the tile C
		*/
		for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
		tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] = 0.0;

		/****
		* Compute the result tile coordinates
		*/
		const IndexType resultTileRow = ( gridIdx_ygridDim.y + blockIdx.y )tileDim;
		const IndexType resultTileColumn = ( gridIdx_xgridDim.x + blockIdx.x )tileDim;

		/****
		* Sum over the matrix tiles
		*/
		for( IndexType i = 0; i < matrixAColumns; i += tileDim )
		{
		for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
		{
		const IndexType matrixARow = resultTileRow + threadIdx.y + row;
		const IndexType matrixAColumn = i + threadIdx.x;
		if( matrixARow < matrixARows && matrixAColumn < matrixAColumns )
		tileA[ (threadIdx.y + row)*tileDim + threadIdx.x ] =
		matrixAMultiplicator * matrixA->getElementFast( matrixARow, matrixAColumn );

		const IndexType matrixBRow = i + threadIdx.y + row;
		const IndexType matrixBColumn = resultTileColumn + threadIdx.x;
		if( matrixBRow < matrixBRows && matrixBColumn < matrixBColumns )
		tileB[ (threadIdx.y + row)*tileDim + threadIdx.x ] =
		matrixBMultiplicator * matrixB->getElementFast( matrixBRow, matrixBColumn );
		}
		__syncthreads();

		const IndexType tileALastRow = tnlCudaMin( tileDim, matrixARows - resultTileRow );
		const IndexType tileALastColumn = tnlCudaMin( tileDim, matrixAColumns - i );
		const IndexType tileBLastRow = tnlCudaMin( tileDim, matrixBRows - i );
		const IndexType tileBLastColumn =
		tnlCudaMin( tileDim, matrixBColumns - resultTileColumn );

		for( IndexType row = 0; row < tileALastRow; row += tileRowBlockSize )
		{
		RealType sum( 0.0 );
		for( IndexType j = 0; j < tileALastColumn; j++ )
		sum += tileA[ ( threadIdx.y + row )tileDim + j ]
		tileB[ j*tileDim + threadIdx.x ];
		tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] += sum;
		}
		__syncthreads();
		}

		/****
		* Write the result tile to the result matrix
		*/
		const IndexType& matrixCRows = resultMatrix->getRows();
		const IndexType& matrixCColumns = resultMatrix->getColumns();
		for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
		{
		const IndexType matrixCRow = resultTileRow + row + threadIdx.y;
		const IndexType matrixCColumn = resultTileColumn + threadIdx.x;
		if( matrixCRow < matrixCRows && matrixCColumn < matrixCColumns )
		resultMatrix->setElementFast( matrixCRow,
		matrixCColumn,
		tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] );
		}

		}
		#endif

		template< typename Real,
		typename Device,
		typename Index,
		@@ -572,83 +471,13 @@ void DenseMatrix< Real, Device, Index, Organization, RealAllocator >::getMatrixP
		const RealType& matrix1Multiplicator,
		const RealType& matrix2Multiplicator )
		{
		TNL_ASSERT( matrix1.getColumns() == matrix2.getRows() &&
		this->getRows() == matrix1.getRows() &&
		this->getColumns() == matrix2.getColumns(),
		std::cerr << "This matrix columns: " << this->getColumns() << std::endl
		<< "This matrix rows: " << this->getRows() << std::endl
		<< "Matrix1 columns: " << matrix1.getColumns() << std::endl
		<< "Matrix1 rows: " << matrix1.getRows() << std::endl
		<< "Matrix2 columns: " << matrix2.getColumns() << std::endl
		<< "Matrix2 rows: " << matrix2.getRows() << std::endl );

		if( std::is_same< Device, Devices::Host >::value )
		for( IndexType i = 0; i < this->getRows(); i += tileDim )
		for( IndexType j = 0; j < this->getColumns(); j += tileDim )
		{
		const IndexType tileRows = min( tileDim, this->getRows() - i );
		const IndexType tileColumns = min( tileDim, this->getColumns() - j );
		for( IndexType i1 = i; i1 < i + tileRows; i1++ )
		for( IndexType j1 = j; j1 < j + tileColumns; j1++ )
		this->setElementFast( i1, j1, 0.0 );
		TNL_ASSERT_EQ( matrix1.getColumns(), matrix2.getRows(), "" );
		this->setDimensions( matrix1.getRows(), matrix2.getColumns() );

		for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim )
		{
		const IndexType lastK = min( k + tileDim, matrix1.getColumns() );
		for( IndexType i1 = 0; i1 < tileRows; i1++ )
		for( IndexType j1 = 0; j1 < tileColumns; j1++ )
		for( IndexType k1 = k; k1 < lastK; k1++ )
		this->addElementFast( i + i1, j + j1,
		matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) );
		}
		}
		if( std::is_same< Device, Devices::Cuda >::value )
		{
		#ifdef HAVE_CUDA
		dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
		const IndexType matrixProductCudaBlockSize( 256 );
		const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim );
		const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim );
		const IndexType cudaBlockColumns( tileDim );
		const IndexType cudaBlockRows( matrixProductCudaBlockSize / tileDim );
		cudaBlockSize.x = cudaBlockColumns;
		cudaBlockSize.y = cudaBlockRows;
		const IndexType rowGrids = roundUpDivision( rowTiles, Cuda::getMaxGridSize() );
		const IndexType columnGrids = roundUpDivision( columnTiles, Cuda::getMaxGridSize() );

		for( IndexType gridIdx_x = 0; gridIdx_x < columnGrids; gridIdx_x++ )
		for( IndexType gridIdx_y = 0; gridIdx_y < rowGrids; gridIdx_y++ )
		{
		cudaGridSize.x = cudaGridSize.y = Cuda::getMaxGridSize();
		if( gridIdx_x == columnGrids - 1 )
		cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
		if( gridIdx_y == rowGrids - 1 )
		cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
		DenseMatrix* this_kernel = Cuda::passToDevice( *this );
		Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 );
		Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 );
		DenseMatrixProductKernel< Real,
		Index,
		Matrix1,
		Matrix2,
		tileDim,
		cudaBlockRows >
		<<< cudaGridSize,
		cudaBlockSize,
		3tileDimtileDim >>>
		( this_kernel,
		matrix1_kernel,
		matrix2_kernel,
		this->getView().getMatrixProduct( matrix1.getConstView(),
		matrix2.getConstView(),
		matrix1Multiplicator,
		matrix2Multiplicator,
		gridIdx_x,
		gridIdx_y );
		Cuda::freeFromDevice( this_kernel );
		Cuda::freeFromDevice( matrix1_kernel );
		Cuda::freeFromDevice( matrix2_kernel );
		}
		#endif
		}
		matrix2Multiplicator );
		}

		#ifdef HAVE_CUDA

src/TNL/Matrices/DenseMatrixView.h

+5 −5

Original line number	Diff line number	Diff line
		@@ -228,9 +228,9 @@ class DenseMatrixView : public MatrixView< Real, Device, Index >
		const RealType& matrixMultiplicator = 1.0,
		const RealType& thisMatrixMultiplicator = 1.0 );

		template< typename Matrix1, typename Matrix2, int tileDim = 32 >
		void getMatrixProduct( const Matrix1& matrix1,
		const Matrix2& matrix2,
		template< typename MatrixView1, typename MatrixView2, int tileDim = 32 >
		void getMatrixProduct( const MatrixView1& matrix1,
		const MatrixView2& matrix2,
		const RealType& matrix1Multiplicator = 1.0,
		const RealType& matrix2Multiplicator = 1.0 );

src/TNL/Matrices/DenseMatrixView.hpp

+130 −36

Original line number	Diff line number	Diff line
		@@ -19,6 +19,24 @@
		namespace TNL {
		namespace Matrices {

		#ifdef HAVE_CUDA
		template< int tileDim,
		int tileRowBlockSize,
		typename ResultMatrix,
		typename Matrix1,
		typename Matrix2,
		typename Real,
		typename Index >
		__global__ void
		DenseMatrixProductKernel( ResultMatrix resultMatrix,
		const Matrix1 matrixA,
		const Matrix2 matrixB,
		const Real matrixAMultiplicator,
		const Real matrixBMultiplicator,
		const Index gridIdx_x,
		const Index gridIdx_y );
		#endif

		template< typename Real,
		typename Device,
		typename Index,
		@@ -432,15 +450,9 @@ void DenseMatrixView< Real, Device, Index, Organization >::getMatrixProduct( con
		const RealType& matrix1Multiplicator,
		const RealType& matrix2Multiplicator )
		{
		TNL_ASSERT( matrix1.getColumns() == matrix2.getRows() &&
		this->getRows() == matrix1.getRows() &&
		this->getColumns() == matrix2.getColumns(),
		std::cerr << "This matrix columns: " << this->getColumns() << std::endl
		<< "This matrix rows: " << this->getRows() << std::endl
		<< "Matrix1 columns: " << matrix1.getColumns() << std::endl
		<< "Matrix1 rows: " << matrix1.getRows() << std::endl
		<< "Matrix2 columns: " << matrix2.getColumns() << std::endl
		<< "Matrix2 rows: " << matrix2.getRows() << std::endl );
		TNL_ASSERT_EQ( matrix1.getColumns(), matrix2.getRows(), "" );
		TNL_ASSERT_EQ( this->getRows(), matrix1.getRows(), "" );
		TNL_ASSERT_EQ( this->getColumns(), matrix2.getColumns(), "" );

		if( std::is_same< Device, Devices::Host >::value )
		for( IndexType i = 0; i < this->getRows(); i += tileDim )
		@@ -450,7 +462,7 @@ void DenseMatrixView< Real, Device, Index, Organization >::getMatrixProduct( con
		const IndexType tileColumns = min( tileDim, this->getColumns() - j );
		for( IndexType i1 = i; i1 < i + tileRows; i1++ )
		for( IndexType j1 = j; j1 < j + tileColumns; j1++ )
		this->setElementFast( i1, j1, 0.0 );
		( *this )( i1, j1 ) = 0.0;

		for( IndexType k = 0; k < matrix1.getColumns(); k += tileDim )
		{
		@@ -458,14 +470,14 @@ void DenseMatrixView< Real, Device, Index, Organization >::getMatrixProduct( con
		for( IndexType i1 = 0; i1 < tileRows; i1++ )
		for( IndexType j1 = 0; j1 < tileColumns; j1++ )
		for( IndexType k1 = k; k1 < lastK; k1++ )
		this->addElementFast( i + i1, j + j1,
		matrix1.getElementFast( i + i1, k1 ) * matrix2.getElementFast( k1, j + j1 ) );
		( *this )( i + i1, j + j1 ) +=
		matrix1( i + i1, k1 ) * matrix2( k1, j + j1 );
		}
		}
		if( std::is_same< Device, Devices::Cuda >::value )
		{
		#ifdef HAVE_CUDA
		/*dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
		dim3 cudaBlockSize( 0 ), cudaGridSize( 0 );
		const IndexType matrixProductCudaBlockSize( 256 );
		const IndexType rowTiles = roundUpDivision( this->getRows(), tileDim );
		const IndexType columnTiles = roundUpDivision( this->getColumns(), tileDim );
		@@ -484,29 +496,11 @@ void DenseMatrixView< Real, Device, Index, Organization >::getMatrixProduct( con
		cudaGridSize.x = columnTiles % Cuda::getMaxGridSize();
		if( gridIdx_y == rowGrids - 1 )
		cudaGridSize.y = rowTiles % Cuda::getMaxGridSize();
		Dense* this_kernel = Cuda::passToDevice( *this );
		Matrix1* matrix1_kernel = Cuda::passToDevice( matrix1 );
		Matrix2* matrix2_kernel = Cuda::passToDevice( matrix2 );
		DenseMatrixProductKernel< Real,
		Index,
		Matrix1,
		Matrix2,
		tileDim,
		cudaBlockRows >
		<<< cudaGridSize,
		cudaBlockSize,
		3tileDimtileDim >>>
		( this_kernel,
		matrix1_kernel,
		matrix2_kernel,
		matrix1Multiplicator,
		matrix2Multiplicator,
		gridIdx_x,
		gridIdx_y );
		Cuda::freeFromDevice( this_kernel );
		Cuda::freeFromDevice( matrix1_kernel );
		Cuda::freeFromDevice( matrix2_kernel );
		}*/
		DenseMatrixProductKernel< tileDim, cudaBlockRows, DenseMatrixView< RealType, DeviceType, IndexType >,
		MatrixView1, MatrixView2, RealType, IndexType >
		<<< cudaGridSize, cudaBlockSize, 3 * tileDim*tileDim >>>
		( *this, matrix1, matrix2, matrix1Multiplicator, matrix2Multiplicator, gridIdx_x, gridIdx_y );
		}
		#endif
		}
		}
		@@ -688,5 +682,105 @@ Index DenseMatrixView< Real, Device, Index, Organization >::getElementIndex( con
		return this->segments.getGlobalIndex( row, column );
		}


		#ifdef HAVE_CUDA
		template< int tileDim,
		int tileRowBlockSize,
		typename ResultMatrix,
		typename Matrix1,
		typename Matrix2,
		typename Real,
		typename Index >
		__global__ void
		DenseMatrixProductKernel( ResultMatrix resultMatrix,
		const Matrix1 matrixA,
		const Matrix2 matrixB,
		const Real matrixAMultiplicator,
		const Real matrixBMultiplicator,
		const Index gridIdx_x,
		const Index gridIdx_y )
		{
		/****
		* Here we compute product C = A * B. To profit from the fast
		* shared memory we do it by tiles.
		*/

		typedef Index IndexType;
		typedef Real RealType;
		__shared__ Real tileA[ tileDim*tileDim ];
		__shared__ Real tileB[ tileDim*tileDim ];
		__shared__ Real tileC[ tileDim*tileDim ];

		const IndexType& matrixARows = matrixA.getRows();
		const IndexType& matrixAColumns = matrixA.getColumns();
		const IndexType& matrixBRows = matrixB.getRows();
		const IndexType& matrixBColumns = matrixB.getColumns();

		/****
		* Reset the tile C
		*/
		for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
		tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] = 0.0;

		/****
		* Compute the result tile coordinates
		*/
		const IndexType resultTileRow = ( gridIdx_ygridDim.y + blockIdx.y )tileDim;
		const IndexType resultTileColumn = ( gridIdx_xgridDim.x + blockIdx.x )tileDim;

		/****
		* Sum over the matrix tiles
		*/
		for( IndexType i = 0; i < matrixAColumns; i += tileDim )
		{
		for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
		{
		const IndexType matrixARow = resultTileRow + threadIdx.y + row;
		const IndexType matrixAColumn = i + threadIdx.x;
		if( matrixARow < matrixARows && matrixAColumn < matrixAColumns )
		tileA[ (threadIdx.y + row)*tileDim + threadIdx.x ] =
		matrixAMultiplicator * matrixA( matrixARow, matrixAColumn );

		const IndexType matrixBRow = i + threadIdx.y + row;
		const IndexType matrixBColumn = resultTileColumn + threadIdx.x;
		if( matrixBRow < matrixBRows && matrixBColumn < matrixBColumns )
		tileB[ (threadIdx.y + row)*tileDim + threadIdx.x ] =
		matrixBMultiplicator * matrixB( matrixBRow, matrixBColumn );
		}
		__syncthreads();

		const IndexType tileALastRow = TNL::min( tileDim, matrixARows - resultTileRow );
		const IndexType tileALastColumn = TNL::min( tileDim, matrixAColumns - i );
		const IndexType tileBLastRow = TNL::min( tileDim, matrixBRows - i );
		const IndexType tileBLastColumn = TNL::min( tileDim, matrixBColumns - resultTileColumn );

		for( IndexType row = 0; row < tileALastRow; row += tileRowBlockSize )
		{
		RealType sum( 0.0 );
		for( IndexType j = 0; j < tileALastColumn; j++ )
		sum += tileA[ ( threadIdx.y + row )tileDim + j ]
		tileB[ j*tileDim + threadIdx.x ];
		tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ] += sum;
		}
		__syncthreads();
		}

		/****
		* Write the result tile to the result matrix
		*/
		const IndexType& matrixCRows = resultMatrix.getRows();
		const IndexType& matrixCColumns = resultMatrix.getColumns();
		for( IndexType row = 0; row < tileDim; row += tileRowBlockSize )
		{
		const IndexType matrixCRow = resultTileRow + row + threadIdx.y;
		const IndexType matrixCColumn = resultTileColumn + threadIdx.x;
		if( matrixCRow < matrixCRows && matrixCColumn < matrixCColumns )
		resultMatrix( matrixCRow, matrixCColumn ) = tileC[ ( row + threadIdx.y )*tileDim + threadIdx.x ];
		}

		}
		#endif


		} // namespace Matrices
		} // namespace TNL

src/TNL/Solvers/Linear/Preconditioners/Diagonal_impl.h

+1 −2

Original line number	Diff line number	Diff line
		@@ -32,13 +32,12 @@ update( const MatrixPointer& matrixPointer )
		diagonal.setSize( matrixPointer->getRows() );

		VectorViewType diag_view( diagonal );

		const auto kernel_matrix = matrixPointer->getView();

		// TODO: Rewrite this with SparseMatrix::forAllRows
		auto kernel = [=] __cuda_callable__ ( IndexType i ) mutable
		{
		diag_view[ i ] = kernel_matrix.getElement( i, i );
		//diag_view[ i ] = kernel_matrix.getElement( i, i );
		};

		Algorithms::ParallelFor< DeviceType >::exec( (IndexType) 0, diagonal.getSize(), kernel );