Implementing the CUDA support to the dense matrix format. (4bead8e6) · Commits · TNL / tnl-dev

src/implementation/matrices/tnlDenseMatrix_impl.h

+41 −23

Original line number	Diff line number	Diff line
		@@ -140,6 +140,9 @@ bool tnlDenseMatrix< Real, Device, Index >::setElementFast( const IndexType row,
		const IndexType column,
		const RealType& value )
		{
		tnlAssert( row >= 0 && row < this->getRows() &&
		column >= 0 && column < this->getColumns(),
		printf( " row = %d, column = %d, this->getRows = %d, this->getColumns() = %d \n", row, column, this->getRows(), this->getColumns() ) );
		this->values.operator[]( this->getElementIndex( row, column ) ) = value;
		return true;
		}
		@@ -167,6 +170,9 @@ bool tnlDenseMatrix< Real, Device, Index >::addElementFast( const IndexType row,
		const RealType& value,
		const RealType& thisElementMultiplicator )
		{
		tnlAssert( row >= 0 && row < this->getRows() &&
		column >= 0 && column < this->getColumns(),
		printf( " row = %d, column = %d, this->getRows = %d, this->getColumns() = %d \n", row, column, this->getRows(), this->getColumns() ) );
		const IndexType elementIndex = this->getElementIndex( row, column );
		if( thisElementMultiplicator == 1.0 )
		this->values.operator[]( elementIndex ) += value;
		@@ -283,6 +289,9 @@ template< typename Real,
		Real tnlDenseMatrix< Real, Device, Index >::getElementFast( const IndexType row,
		const IndexType column ) const
		{
		tnlAssert( row >= 0 && row < this->getRows() &&
		column >= 0 && column < this->getColumns(),
		printf( " row = %d, column = %d, this->getRows = %d, this->getColumns() = %d \n", row, column, this->getRows(), this->getColumns() ) );
		return this->values.operator[]( this->getElementIndex( row, column ) );
		}

		@@ -624,16 +633,21 @@ template< typename Real,
		int tileRowBlockSize >
		__global__ void tnlDenseMatrixTranspositionAlignedKernel( tnlDenseMatrix< Real, tnlCuda, Index >* resultMatrix,
		const Matrix* inputMatrix,
		const Real matrixMultiplicator,
		const Index gridIdx_x,
		const Index gridIdx_y )
		{
		__shared__ Real tile[ tileDim*tileDim ];

		const Index columns = inputMatrix->getColumns();
		const Index rows = inputMatrix->getRows();


		/****
		* Diagonal mapping of the CUDA blocks
		*/
		Index blockIdx_x, blockIdx_y;
		if( inputMatrix->getColumns() == inputMatrix->getRows() )
		if( columns == rows )
		{
		blockIdx_y = blockIdx.x;
		blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
		@@ -652,13 +666,13 @@ __global__ void tnlDenseMatrixTranspositionAlignedKernel( tnlDenseMatrix< Real,
		( gridIdx_ygridDim.y + blockIdx_y )tileDim + threadIdx.y;
		const Index readColumnPosition =
		( gridIdx_xgridDim.x + blockIdx_x )tileDim + threadIdx.x;
		//const Index readOffset = readRowPosition * columns + readColumnPosition;
		for( Index rowBlock = 0;
		rowBlock < tileDim;
		rowBlock += tileRowBlockSize )
		{
		tile[ tnlCuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] =
		inputMatrix->getElementFast( readRowPosition + rowBlock, readColumnPosition );
		inputMatrix->getElementFast( readColumnPosition,
		readRowPosition + rowBlock );
		}
		__syncthreads();

		@@ -669,15 +683,14 @@ __global__ void tnlDenseMatrixTranspositionAlignedKernel( tnlDenseMatrix< Real,
		( gridIdx_xgridDim.x + blockIdx_x )tileDim + threadIdx.y;
		const Index writeColumnPosition =
		( gridIdx_ygridDim.y + blockIdx_y )tileDim + threadIdx.x;
		//const Index writeOffset = writeRowPosition * inputMatrixrows + writeColumnPosition;
		for( Index rowBlock = 0;
		rowBlock < tileDim;
		rowBlock += tileRowBlockSize )
		{
		resultMatrix->setElementFast( writeColumnPosition,
		writeRowPosition + rowBlock,
		tile[ tnlCuda::getInterleaving(
		( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ) ] );
		matrixMultiplicator * tile[ tnlCuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );

		}

		}
		@@ -689,16 +702,20 @@ template< typename Real,
		int tileRowBlockSize >
		__global__ void tnlDenseMatrixTranspositionNonAlignedKernel( tnlDenseMatrix< Real, tnlCuda, Index >* resultMatrix,
		const Matrix* inputMatrix,
		const Real matrixMultiplicator,
		const Index gridIdx_x,
		const Index gridIdx_y )
		{
		__shared__ Real tile[ tileDim*tileDim ];

		const Index columns = inputMatrix->getColumns();
		const Index rows = inputMatrix->getRows();

		/****
		* Diagonal mapping of the CUDA blocks
		*/
		Index blockIdx_x, blockIdx_y;
		if( inputMatrix->getColumns() == inputMatrix->getRows() )
		if( columns == rows )
		{
		blockIdx_y = blockIdx.x;
		blockIdx_x = (blockIdx.x+blockIdx.y)%gridDim.x;
		@@ -717,18 +734,17 @@ __global__ void tnlDenseMatrixTranspositionNonAlignedKernel( tnlDenseMatrix< Rea
		( gridIdx_ygridDim.y + blockIdx_y )tileDim + threadIdx.y;
		const Index readColumnPosition =
		( gridIdx_xgridDim.x + blockIdx_x )tileDim + threadIdx.x;
		if( readColumnPosition < inputMatrix->getColumns() )
		if( readColumnPosition < columns )
		{
		//const Index readOffset = readRowPosition * columns + readColumnPosition;
		const Index readOffset = readRowPosition * columns + readColumnPosition;
		for( Index rowBlock = 0;
		rowBlock < tileDim;
		rowBlock += tileRowBlockSize )
		{
		if( readRowPosition + rowBlock < inputMatrix->getRows() )
		tile[ tnlCuda::getInterleaving(
		( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ) ] =
		inputMatrix->getElementFast( readRowPosition + rowBlock, readColumnPosition );

		if( readRowPosition + rowBlock < rows )
		tile[ tnlCuda::getInterleaving( threadIdx.x*tileDim + threadIdx.y + rowBlock ) ] =
		inputMatrix->getElementFast( readColumnPosition,
		readRowPosition + rowBlock );
		}
		}
		__syncthreads();
		@@ -740,20 +756,20 @@ __global__ void tnlDenseMatrixTranspositionNonAlignedKernel( tnlDenseMatrix< Rea
		( gridIdx_xgridDim.x + blockIdx_x )tileDim + threadIdx.y;
		const Index writeColumnPosition =
		( gridIdx_ygridDim.y + blockIdx_y )tileDim + threadIdx.x;
		if( writeColumnPosition < inputMatrix->getRows() )
		if( writeColumnPosition < rows )
		{
		//const Index writeOffset = writeRowPosition * rows + writeColumnPosition;
		const Index writeOffset = writeRowPosition * rows + writeColumnPosition;
		for( Index rowBlock = 0;
		rowBlock < tileDim;
		rowBlock += tileRowBlockSize )
		{
		if( writeRowPosition + rowBlock < inputMatrix->getColumns() )
		resultMatrix->setElementFast( writeRowPosition + rowBlock,
		writeColumnPosition,
		tile[ tnlCuda::getInterleaving(
		( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ) ] );
		if( writeRowPosition + rowBlock < columns )
		resultMatrix->setElementFast( writeColumnPosition,
		writeRowPosition + rowBlock,
		matrixMultiplicator * tile[ tnlCuda::getInterleaving( ( threadIdx.y + rowBlock ) * tileDim + threadIdx.x ) ] );
		}
		}

		}


		@@ -783,7 +799,7 @@ void tnlDenseMatrix< Real, Device, Index >::getTransposition( const Matrix& matr
		for( IndexType j = 0; j < columns; j += tileDim )
		for( IndexType k = i; k < i + tileDim && k < rows; k++ )
		for( IndexType l = j; l < j + tileDim && l < columns; l++ )
		this->setElement( l, k, matrix. getElement( k, l ) );
		this->setElement( l, k, matrixMultiplicator * matrix. getElement( k, l ) );
		}
		if( Device::getDevice() == tnlCudaDevice )
		{
		@@ -814,7 +830,6 @@ void tnlDenseMatrix< Real, Device, Index >::getTransposition( const Matrix& matr
		if( ( gridIdx_x < columnGrids - 1 \|\| matrix.getColumns() % tileDim == 0 ) &&
		( gridIdx_y < rowGrids - 1 \|\| matrix.getRows() % tileDim == 0 ) )
		{
		cerr << "Aligned" << endl;
		tnlDenseMatrixTranspositionAlignedKernel< Real,
		Index,
		Matrix,
		@@ -825,6 +840,7 @@ void tnlDenseMatrix< Real, Device, Index >::getTransposition( const Matrix& matr
		sharedMemorySize >>>
		( this_device,
		matrix_device,
		matrixMultiplicator,
		gridIdx_x,
		gridIdx_y );
		}
		@@ -840,9 +856,11 @@ void tnlDenseMatrix< Real, Device, Index >::getTransposition( const Matrix& matr
		sharedMemorySize >>>
		( this_device,
		matrix_device,
		matrixMultiplicator,
		gridIdx_x,
		gridIdx_y );
		}
		checkCudaDevice;
		}
		tnlCuda::freeFromDevice( this_device );
		tnlCuda::freeFromDevice( matrix_device );

src/matrices/tnlMatrix.h

+2 −0

Original line number	Diff line number	Diff line
		@@ -109,6 +109,8 @@ class tnlMatrix : public virtual tnlObject

		IndexType rows, columns;

		public: // TODO: remove this

		tnlVector< Real, Device, Index > values;
		};

tests/unit-tests/matrices/tnlDenseMatrixTester.h

+17 −21

Original line number	Diff line number	Diff line
		@@ -314,38 +314,34 @@ class tnlDenseMatrixTester : public CppUnit :: TestCase

		void matrixTranspositionTest()
		{
		const int size = 10;
		const int alignedSize = 64;
		MatrixType m;
		m.setDimensions( 10, 10 );
		for( int i = 0; i < size; i++ )
		for( int j = 0; j < size; j++ )
		m.setElement( i, j, i*size + j );
		m.setDimensions( alignedSize, alignedSize );
		for( int i = 0; i < alignedSize; i++ )
		for( int j = 0; j < alignedSize; j++ )
		m.setElement( i, j, i*alignedSize + j );

		MatrixType mTransposed;
		mTransposed.setLike( m );
		mTransposed. template getTransposition< MatrixType, 4 >( m );

		//cout << m << endl;
		//cout << mTransposed << endl;
		mTransposed. template getTransposition< MatrixType, 32 >( m );

		for( int i = 0; i < size; i++ )
		for( int j = 0; j < size; j++ )
		for( int i = 0; i < alignedSize; i++ )
		for( int j = 0; j < alignedSize; j++ )
		CPPUNIT_ASSERT( m.getElement( i, j ) == mTransposed.getElement( j, i ) );

		mTransposed. template getTransposition< MatrixType, 5 >( m );
		const int nonAlignedSize = 50;
		m.setDimensions( nonAlignedSize, nonAlignedSize );
		for( int i = 0; i < nonAlignedSize; i++ )
		for( int j = 0; j < nonAlignedSize; j++ )
		m.setElement( i, j, i*nonAlignedSize + j );

		//cout << m << endl;
		//cout << mTransposed << endl;
		mTransposed.setLike( m );
		mTransposed. template getTransposition< MatrixType, 32 >( m );

		for( int i = 0; i < size; i++ )
		for( int j = 0; j < size; j++ )
		for( int i = 0; i < nonAlignedSize; i++ )
		for( int j = 0; j < nonAlignedSize; j++ )
		CPPUNIT_ASSERT( m.getElement( i, j ) == mTransposed.getElement( j, i ) );

		}




		};

		#ifdef HAVE_CUDA