Implementing the SlicedEllpack matrix in CUDA. (e72fc80b) · Commits · TNL / tnl-dev

src/implementation/matrices/tnlEllpackMatrix_impl.h

+3 −3

Original line number	Diff line number	Diff line
		@@ -668,7 +668,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
		static Index getRowBegin( const tnlEllpackMatrix< Real, Device, Index >& matrix,
		const Index row )
		{
		return row * matrix.rowLengths;
		return row;
		}

		template< typename Real,
		@@ -679,7 +679,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
		static Index getRowEnd( const tnlEllpackMatrix< Real, Device, Index >& matrix,
		const Index row )
		{
		return ( row + 1 ) * matrix.rowLengths;
		return row + getElementStep( matrix ) * matrix.rowLengths;
		}

		template< typename Real,
		@@ -689,7 +689,7 @@ class tnlEllpackMatrixDeviceDependentCode< tnlCuda >
		#endif
		static Index getElementStep( const tnlEllpackMatrix< Real, Device, Index >& matrix )
		{
		return 1;
		return matrix.alignedRows;
		}
		};

src/implementation/matrices/tnlSlicedEllpackMatrix_impl.h

+148 −16

Original line number	Diff line number	Diff line
		@@ -75,26 +75,11 @@ bool tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::setRowLengths( co
		if( ! this->sliceRowLengths.setSize( slices ) \|\|
		! this->slicePointers.setSize( slices + 1 ) )
		return false;
		IndexType row( 0 ), slice( 0 ), sliceRowLength( 0 );

		/****
		* Compute maximal row length in each slice
		*/
		while( row < this->rows )
		{
		sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
		if( row % SliceSize == 0 )
		{
		this->sliceRowLengths.setElement( slice, sliceRowLength );
		this->slicePointers.setElement( slice++, sliceRowLength*SliceSize );
		sliceRowLength = 0;
		}
		}
		if( row % SliceSize != 0 )
		{
		this->sliceRowLengths.setElement( slice, sliceRowLength );
		this->slicePointers.setElement( slice++, sliceRowLength*SliceSize );
		}
		DeviceDependentCode::computeMaximalRowLengthInSlices( *this, rowLengths );

		/****
		* Compute the slice pointers using the exclusive prefix sum
		@@ -567,4 +552,151 @@ void tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::print( ostream& s
		}
		}

		template<>
		class tnlSlicedEllpackMatrixDeviceDependentCode< tnlHost >
		{
		public:

		typedef tnlHost Device;

		template< typename Real,
		typename Index,
		int SliceSize >
		static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
		const Index row )
		{
		return row * matrix.rowLengths;
		}

		template< typename Real,
		typename Index,
		int SliceSize >
		static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
		const Index row )
		{
		return ( row + 1 ) * matrix.rowLengths;
		}

		template< typename Real,
		typename Index,
		int SliceSize >
		static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
		{
		return 1;
		}

		template< typename Real,
		typename Index,
		int SliceSize >
		static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
		const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
		{
		Index row( 0 ), slice( 0 ), sliceRowLength( 0 );
		while( row < matrix.getRows() )
		{
		sliceRowLength = Max( rowLengths.getElement( row++ ), sliceRowLength );
		if( row % SliceSize == 0 )
		{
		matrix.sliceRowLengths.setElement( slice, sliceRowLength );
		matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
		sliceRowLength = 0;
		}
		}
		if( row % SliceSize != 0 )
		{
		matrix.sliceRowLengths.setElement( slice, sliceRowLength );
		matrix.slicePointers.setElement( slice++, sliceRowLength * SliceSize );
		}
		}
		};

		#ifdef HAVE_CUDA
		template< typename Real,
		typename Device,
		typename Index,
		int SliceSize >
		__global__ void tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>>
		( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >* matrix,
		const typename tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >::RowLentghsVector* rowLengths,
		int gridIdx )
		{

		}
		#endif

		template<>
		class tnlSlicedEllpackMatrixDeviceDependentCode< tnlCuda >
		{
		public:

		typedef tnlCuda Device;

		template< typename Real,
		typename Index,
		int SliceSize >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		static Index getRowBegin( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
		const Index row )
		{
		return row * matrix.rowLengths;
		}

		template< typename Real,
		typename Index,
		int SliceSize >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		static Index getRowEnd( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
		const Index row )
		{
		return ( row + 1 ) * matrix.rowLengths;
		}

		template< typename Real,
		typename Index,
		int SliceSize >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		static Index getElementStep( const tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix )
		{
		return 1;
		}

		template< typename Real,
		typename Index,
		int SliceSize >
		static bool computeMaximalRowLengthInSlices( tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize >& matrix,
		const typename tnlSlicedEllpackMatrix< Real, Device, Index >::RowLengthsVector& rowLengths )
		{
		#ifdef HAVE_CUDA
		typedef tnlSlicedEllpackMatrix< Real, Device, Index, SliceSize > Matrix;
		typedef typename Matrix::RowLengthsVector RowLengthsVector;
		Matrix* kernel_matrix = tnlCuda::passToDevice( matrix );
		RowLengthsVector* kernel_rowLengths = tnlCuda::passToDevice( rowLengths );
		const Index numberOfSlices = roundUpDivision( matrix.getRows(), SliceSize );
		dim3 cudaBlockSize( 256 ), cudaGridSize( tnlCuda::getMaxGridSize() );
		const IndexType cudaBlocks = roundUpDivision( numberOfSlices, cudaBlockSize.x );
		const IndexType cudaGrids = roundUpDivision( cudaBlocks, tnlCuda::getMaxGridSize() );
		for( IndexType gridIdx = 0; gridIdx < cudaGrids; gridIdx++ )
		{
		if( gridIdx == cudaGrids - 1 )
		cudaGridSize.x = cudaBlocks % tnlCuda::getMaxGridSize();
		tnlSlicedEllpackMatrix_compuetMaximalRowLengthInSlices_CudaKernel<<< cudaGridSize, cudaBlockSize >>>
		( kernel_matrix,
		kernel_rowLengths,
		gridIdx );
		}
		tnlCuda::freeFromDevice( kernel_matrix );
		tnlCuda::freeFromDevice( kernel_rowLengths );
		checkCudaDevice;
		#endif
		}
		};



		#endif /* TNLSLICEDELLPACKMATRIX_IMPL_H_ */

src/matrices/tnlSlicedEllpackMatrix.h

+17 −1

Original line number	Diff line number	Diff line
		@@ -21,6 +21,9 @@
		#include <matrices/tnlSparseMatrix.h>
		#include <core/vectors/tnlVector.h>

		template< typename Device >
		class tnlSlicedEllpackMatrixDeviceDependentCode;

		template< typename Real = double,
		typename Device = tnlHost,
		typename Index = int,
		@@ -33,6 +36,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		typedef Device DeviceType;
		typedef Index IndexType;
		typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::RowLengthsVector RowLengthsVector;
		typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ValuesVector ValuesVector;
		typedef typename tnlSparseMatrix< RealType, DeviceType, IndexType >::ColumnIndexesVector ColumnIndexesVector;
		typedef tnlSlicedEllpackMatrix< Real, Device, Index > ThisType;

		tnlSlicedEllpackMatrix();

		@@ -130,6 +136,13 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		IndexType* columns,
		RealType* values ) const;

		template< typename Vector >
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		typename Vector::RealType rowVectorProduct( const IndexType row,
		const Vector& vector ) const;

		template< typename Vector >
		void vectorProduct( const Vector& inVector,
		Vector& outVector ) const;
		@@ -163,6 +176,9 @@ class tnlSlicedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

		tnlVector< Index, Device, Index > slicePointers, sliceRowLengths;

		typedef tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
		friend class tnlSlicedEllpackMatrixDeviceDependentCode< DeviceType >;

		};

		#include <implementation/matrices/tnlSlicedEllpackMatrix_impl.h>