Implementing tridiagonal matrix in CUDA. (190cc1a7) · Commits · TNL / tnl-dev

.gitignore

+2 −0

Original line number	Diff line number	Diff line
		@@ -14,6 +14,8 @@
		/Debug
		/Release

		.settings

		# /src/
		/src/Makefile.in

CMakeLists.txt

+1 −0

Original line number	Diff line number	Diff line
		@@ -34,6 +34,7 @@ endif()

		if( WITH_CUDA STREQUAL "yes" )
		AddCompilerFlag( "-DHAVE_NOT_CXX11 -U_GLIBCXX_ATOMIC_BUILTINS -U_GLIBCXX_USE_INT128" )
		set( CUDA_LINKER_OPTIONS "-arch sm_20 -shared" ) # TODO: fix this in the rest of cmake files
		else()
		AddCompilerFlag( "-std=gnu++0x" )
		endif()

install

+1 −1

Original line number	Diff line number	Diff line
		@@ -2,7 +2,7 @@

		TARGET=TNL
		INSTALL_PREFIX=${HOME}/local
		WITH_CUDA=no
		WITH_CUDA=yes
		WITH_CUSPARSE=no
		CUDA_ARCHITECTURE=2.0
		TEMPLATE_EXPLICIT_INSTANTIATION=yes

src/implementation/matrices/tnlTridiagonalMatrix_impl.h

+216 −66

Original line number	Diff line number	Diff line
		@@ -21,6 +21,85 @@
		#include <core/tnlAssert.h>
		#include <matrices/tnlTridiagonalMatrix.h>

		template< typename Device >
		class tnlTridiagonalMatrixDeviceDependentCode;

		template<>
		class tnlTridiagonalMatrixDeviceDependentCode< tnlHost >
		{
		public:

		template< typename Index >
		static Index getElementIndex( const Index rows,
		const Index row,
		const Index column )
		{
		return 3*row + column - row;
		}

		template< typename Vector,
		typename Index,
		typename ValuesType >
		static typename Vector::RealType rowVectorProduct( const Index rows,
		const ValuesType& values,
		const Index row,
		const Vector& vector )
		{
		if( row == 0 )
		return vector[ 0 ] * values[ 0 ] +
		vector[ 1 ] * values[ 1 ];
		Index i = 3 * row - 1;
		if( row == rows - 1 )
		return vector[ row - 1 ] * values[ i++ ] +
		vector[ row ] * values[ i ];
		return vector[ row - 1 ] * values[ i++ ] +
		vector[ row ] * values[ i++ ] +
		vector[ row + 1 ] * values[ i ];
		}
		};

		template<>
		class tnlTridiagonalMatrixDeviceDependentCode< tnlCuda >
		{
		public:

		template< typename Index >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		static Index getElementIndex( const Index rows,
		const Index row,
		const Index column )
		{
		return ( column - row + 1 )*rows + row - 1;
		}

		template< typename Vector,
		typename Index,
		typename ValuesType >
		#ifdef HAVE_CUDA
		__device__
		#endif
		static typename Vector::RealType rowVectorProduct( const Index rows,
		const ValuesType& values,
		const Index row,
		const Vector& vector )
		{
		if( row == 0 )
		return vector[ 0 ] * values[ 0 ] +
		vector[ 1 ] * values[ rows - 1 ];
		Index i = row - 1;
		if( row == rows - 1 )
		return vector[ row - 1 ] * values[ i ] +
		vector[ row ] * values[ i + rows ];
		return vector[ row - 1 ] * values[ i ] +
		vector[ row ] * values[ i + rows ] +
		vector[ row + 1 ] * values[ i + 2*rows ];
		}
		};



		template< typename Real,
		typename Device,
		typename Index >
		@@ -58,6 +137,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setDimensions( const IndexType
		if( ! values.setSize( 3*Min( rows, columns ) ) )
		return false;
		this->values.setValue( 0.0 );
		return true;
		}

		template< typename Real,
		@@ -174,7 +254,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElementFast( const IndexTyp
		const IndexType column,
		const RealType& value )
		{
		this->values.setElement( this->getElementIndex( row, column ), value );
		this->values[ this->getElementIndex( row, column ) ] = value;
		return true;
		}

		@@ -189,7 +269,6 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setElement( const IndexType ro
		return true;
		}


		template< typename Real,
		typename Device,
		typename Index >
		@@ -201,7 +280,8 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElementFast( const IndexTyp
		const RealType& value,
		const RealType& thisElementMultiplicator )
		{
		this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator );
		const Index i = this->getElementIndex( row, column );
		this->values[ i ] = thisElementMultiplicator*this->values[ i ] + value;
		}

		template< typename Real,
		@@ -212,10 +292,10 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addElement( const IndexType ro
		const RealType& value,
		const RealType& thisElementMultiplicator )
		{
		//this->values.addElement( this->getElementIndex( row, column ), value, thisElementMultiplicator );
		const Index i = this->getElementIndex( row, column );
		this->values.setElement( i, thisElementMultiplicator * this->values.getElement( i ) + value );
		}


		template< typename Real,
		typename Device,
		typename Index >
		@@ -231,7 +311,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRowFast( const IndexType ro
		cerr << " elements = " << elements
		<< " this->columns = " << this->columns
		<< " this->getName() = " << this->getName() );
		return this->addRow( row, columns, values, elements, 0.0 );
		return this->addRowFast( row, columns, values, elements, 0.0 );
		}

		template< typename Real,
		@@ -246,10 +326,9 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::setRow( const IndexType row,
		cerr << " elements = " << elements
		<< " this->columns = " << this->columns
		<< " this->getName() = " << this->getName() );
		//return this->addRow( row, columns, values, elements, 0.0 );
		return this->addRow( row, columns, values, elements, 0.0 );
		}


		template< typename Real,
		typename Device,
		typename Index >
		@@ -273,7 +352,7 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRowFast( const IndexType ro
		const IndexType& column = columns[ i ];
		if( column < row - 1 \|\| column > row + 1 )
		return false;
		addElement( row, column, values[ i ], thisRowMultiplicator );
		addElementFast( row, column, values[ i ], thisRowMultiplicator );
		}
		return true;
		}
		@@ -291,16 +370,16 @@ bool tnlTridiagonalMatrix< Real, Device, Index >::addRow( const IndexType row,
		cerr << " elements = " << elements
		<< " this->columns = " << this->columns
		<< " this->getName() = " << this->getName() );
		/*if( elements > 3 )
		if( elements > 3 )
		return false;
		for( IndexType i = 0; i < elements; i++ )
		{
		const IndexType& column = columns[ i ];
		const IndexType column = columns[ i ];
		if( column < row - 1 \|\| column > row + 1 )
		return false;
		addElement( row, column, values[ i ], thisRowMultiplicator );
		}
		return true;*/
		return true;
		}

		template< typename Real,
		@@ -314,7 +393,7 @@ Real tnlTridiagonalMatrix< Real, Device, Index >::getElementFast( const IndexTyp
		{
		if( abs( column - row ) > 1 )
		return 0.0;
		return this->values.getElement( this->getElementIndex( row, column ) );
		return this->values[ this->getElementIndex( row, column ) ];
		}

		template< typename Real,
		@@ -323,10 +402,11 @@ template< typename Real,
		Real tnlTridiagonalMatrix< Real, Device, Index >::getElement( const IndexType row,
		const IndexType column ) const
		{
		return this->getElementFast( row, column );
		if( abs( column - row ) > 1 )
		return 0.0;
		return this->values.getElement( this->getElementIndex( row, column ) );
		}


		template< typename Real,
		typename Device,
		typename Index >
		@@ -357,56 +437,50 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getRow( const IndexType row,
		IndexType* columns,
		RealType* values ) const
		{
		/*IndexType elementPointer( 0 );
		IndexType elementPointer( 0 );
		for( IndexType i = -1; i <= 1; i++ )
		{
		const IndexType column = row + 1;
		if( column >= 0 && column < this->getColumns() )
		{
		columns[ elementPointer ] = column;
		values[ elementPointer ] = this->values[ this->getElementIndex( row, column ) ];
		values[ elementPointer ] = this->values.getElement( this->getElementIndex( row, column ) );
		elementPointer++;
		}
		}*/
		}


		template< typename Real,
		typename Device,
		typename Index >
		Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row,
		const IndexType column )
		{
		return this->values[ this->getElementIndex( row, column ) ];
		}

		template< typename Real,
		typename Device,
		typename Index >
		const Real& tnlTridiagonalMatrix< Real, Device, Index >::operator()( const IndexType row,
		const IndexType column ) const
		template< typename Vector >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
		const Vector& vector ) const
		{
		return this->values[ this->getElementIndex( row, column ) ];
		return tnlTridiagonalMatrixDeviceDependentCode< Device >::
		rowVectorProduct( this->rows,
		this->values,
		row,
		vector );
		}

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Device,
		typename Index >
		template< typename Vector >
		typename Vector::RealType tnlTridiagonalMatrix< Real, Device, Index >::rowVectorProduct( const IndexType row,
		const Vector& vector ) const
		typename Index,
		typename Vector >
		__global__ void tnlTridiagonalMatrixVectorProductCudaKernel( tnlTridiagonalMatrix< Real, tnlCuda, Index >* matrix,
		const Vector* inVector,
		Vector* outVector,
		const Index gridIdx )
		{
		if( row == 0 )
		return vector[ 0 ] * this->values[ 0 ] +
		vector[ 1 ] * this->values[ 1 ];
		IndexType i = 3 * row - 1;
		if( row == this->rows - 1 )
		return vector[ row - 1 ] * this->values[ i++ ] +
		vector[ row ] * this->values[ i ];
		return vector[ row - 1 ] * this->values[ i++ ] +
		vector[ row ] * this->values[ i++ ] +
		vector[ row + 1 ] * this->values[ i ];
		const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		if( rowIdx < matrix->getRows() )
		( outVector )[ rowIdx ] = matrix->rowVectorProduct( rowIdx, inVector );
		}
		#endif

		template< typename Real,
		typename Device,
		@@ -426,8 +500,32 @@ void tnlTridiagonalMatrix< Real, Device, Index >::vectorProduct( const Vector& i
		<< "Vector size: " << outVector.getSize() << endl
		<< "Vector name: " << outVector.getName() << endl );

		if( Device::getDevice() == tnlHostDevice )
		for( IndexType row = 0; row < this->getRows(); row++ )
		outVector[ row ] = this->rowVectorProduct( row, inVector );

		if( Device::getDevice() == tnlCudaDevice )
		{
		#ifdef HAVE_CUDA
		ThisType* kernel_this = tnlCuda::passToDevice( *this );
		Vector* kernel_inVector = tnlCuda::passToDevice( inVector );
		Vector* kernel_outVector = tnlCuda::passToDevice( outVector );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( this->getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlTridiagonalMatrixVectorProductCudaKernel<<< cudaGridSize, cudaBlockSize >>>
		( kernel_this, kernel_inVector, kernel_outVector, gridIdx );
		}
		tnlCuda::freeFromDevice( kernel_this );
		tnlCuda::freeFromDevice( kernel_inVector );
		tnlCuda::freeFromDevice( kernel_outVector );
		checkCudaDevice;
		#endif
		}
		}

		template< typename Real,
		@@ -443,31 +541,52 @@ void tnlTridiagonalMatrix< Real, Device, Index >::addMatrix( const tnlTridiagona
		<< "This matrix rows: " << this->getRows() << endl
		<< "This matrix name: " << this->getName() << endl );

		const IndexType lastI = this->values.getSize();
		if( thisMatrixMultiplicator == 1.0 )
		{
		for( IndexType i = 0; i < lastI; i++ )
		this->values[ i ] += matrixMultiplicator*matrix.values[ i ];
		}
		this->values.alphaXPlusY( matrixMultiplicator, matrix.values );
		else
		{
		for( IndexType i = 0; i < lastI; i++ )
		this->values[ i ] = thisMatrixMultiplicator * this->values[ i ] +
		matrixMultiplicator*matrix.values[ i ];
		this->values.alphaXPlusBetaY( matrixMultiplicator, matrix.values, thisMatrixMultiplicator );
		}

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Real2,
		typename Index,
		typename Index2 >
		__global__ void tnlTridiagonalMatrixTranspositionCudaKernel( const tnlTridiagonalMatrix< Real2, tnlCuda, Index2 >* inMatrix,
		tnlTridiagonalMatrix< Real, tnlCuda, Index >* outMatrix,
		const Real matrixMultiplicator,
		const Index gridIdx )
		{
		const Index rowIdx = ( gridIdx * tnlCuda::getMaxGridSize() + blockIdx.x ) * blockDim.x + threadIdx.x;
		if( rowIdx < inMatrix->getRows() )
		{
		if( rowIdx > 0 )
		outMatrix->setElementFast( rowIdx-1,
		rowIdx,
		matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx-1 ) );
		outMatrix->setElementFast( rowIdx,
		rowIdx,
		matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx ) );
		if( rowIdx < inMatrix->getRows()-1 )
		outMatrix->setElementFast( rowIdx+1,
		rowIdx,
		matrixMultiplicator * inMatrix->getElementFast( rowIdx, rowIdx+1 ) );
		}
		}
		#endif

		template< typename Real,
		typename Device,
		typename Index >
		template< typename Real2, typename Index2, int tileDim >
		template< typename Real2, typename Index2 >
		void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
		const RealType& matrixMultiplicator )
		{
		tnlAssert( this->getRows() == matrix.getRows(),
		cerr << "This matrix rows: " << this->getRows() << endl
		<< "That matrix rows: " << matrix.getRows() << endl );

		if( Device::getDevice() == tnlHostDevice )
		{
		const IndexType& rows = matrix.getRows();
		for( IndexType i = 1; i < rows; i++ )
		{
		@@ -477,6 +596,31 @@ void tnlTridiagonalMatrix< Real, Device, Index >::getTransposition( const tnlTri
		this->setElement( i - 1, i, aux );
		}
		}
		if( Device::getDevice() == tnlCudaDevice )
		{
		#ifdef HAVE_CUDA
		ThisType* kernel_this = tnlCuda::passToDevice( *this );
		typedef tnlTridiagonalMatrix< Real2, Device, Index2 > InMatrixType;
		InMatrixType* kernel_inMatrix = tnlCuda::passToDevice( matrix );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( matrix.getRows(), cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlTridiagonalMatrixTranspositionCudaKernel<<< cudaGridSize, cudaBlockSize >>>
		( kernel_inMatrix,
		kernel_this,
		matrixMultiplicator,
		gridIdx );
		}
		tnlCuda::freeFromDevice( kernel_this );
		tnlCuda::freeFromDevice( kernel_inMatrix );
		checkCudaDevice;
		#endif
		}
		}

		template< typename Real,
		typename Device,
		@@ -547,7 +691,7 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
		str <<"Row: " << row << " -> ";
		for( IndexType column = row - 1; column < row + 2; column++ )
		if( column >= 0 && column < this->columns )
		str << " Col:" << column << "->" << this->operator()( row, column ) << "\t";
		str << " Col:" << column << "->" << this->getElement( row, column ) << "\t";
		str << endl;
		}
		}
		@@ -555,15 +699,21 @@ void tnlTridiagonalMatrix< Real, Device, Index >::print( ostream& str ) const
		template< typename Real,
		typename Device,
		typename Index >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		Index tnlTridiagonalMatrix< Real, Device, Index >::getElementIndex( const IndexType row,
		const IndexType column ) const
		{
		// TODO: remove the #ifndef when CUDA supports std:cerr
		#ifndef HAVE_CUDA
		tnlAssert( row >= 0 && column >= 0 && row < this->rows && column < this->rows,
		cerr << " this->rows = " << this->rows
		<< " row = " << row << " column = " << column );
		tnlAssert( abs( row - column ) < 2,
		cerr << "row = " << row << " column = " << column << endl );
		return 3*row + column - row;
		#endif
		return tnlTridiagonalMatrixDeviceDependentCode< Device >::getElementIndex( this->rows, row, column );
		}

src/matrices/tnlTridiagonalMatrix.h

+28 −7

Original line number	Diff line number	Diff line
		@@ -32,6 +32,7 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
		typedef Device DeviceType;
		typedef Index IndexType;
		typedef typename tnlMatrix< Real, Device, Index >::RowLengthsVector RowLengthsVector;
		typedef tnlTridiagonalMatrix< Real, Device, Index > ThisType;

		tnlTridiagonalMatrix();

		@@ -82,6 +83,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
		const RealType& value,
		const RealType& thisElementMultiplicator = 1.0 );

		bool addElement( const IndexType row,
		const IndexType column,
		const RealType& value,
		const RealType& thisElementMultiplicator = 1.0 );

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		@@ -90,6 +96,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
		const RealType* values,
		const IndexType elements );

		bool setRow( const IndexType row,
		const IndexType* columns,
		const RealType* values,
		const IndexType elements );

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		@@ -99,6 +110,12 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
		const IndexType elements,
		const RealType& thisRowMultiplicator = 1.0 );

		bool addRow( const IndexType row,
		const IndexType* columns,
		const RealType* values,
		const IndexType elements,
		const RealType& thisRowMultiplicator = 1.0 );

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		@@ -115,13 +132,14 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
		IndexType* columns,
		RealType* values ) const;

		RealType& operator()( const IndexType row,
		const IndexType column );

		const RealType& operator()( const IndexType row,
		const IndexType column ) const;
		void getRow( const IndexType row,
		IndexType* columns,
		RealType* values ) const;

		template< typename Vector >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		typename Vector::RealType rowVectorProduct( const IndexType row,
		const Vector& vector ) const;

		@@ -135,11 +153,11 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >
		const RealType& thisMatrixMultiplicator = 1.0 );

		#ifdef HAVE_NOT_CXX11
		template< typename Real2, typename Index2, int tileDim >
		template< typename Real2, typename Index2 >
		void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
		const RealType& matrixMultiplicator = 1.0 );
		#else
		template< typename Real2, typename Index2, int tileDim = 32 >
		template< typename Real2, typename Index2 >
		void getTransposition( const tnlTridiagonalMatrix< Real2, Device, Index2 >& matrix,
		const RealType& matrixMultiplicator = 1.0 );
		#endif
		@@ -162,6 +180,9 @@ class tnlTridiagonalMatrix : public tnlMatrix< Real, Device, Index >

		protected:

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		IndexType getElementIndex( const IndexType row,
		const IndexType column ) const;