Implementing the Chunked Ellpack format in CUDA. (0fb43230) · Commits · TNL / tnl-dev

src/implementation/matrices/tnlChunkedEllpackMatrix_impl.h

+449 −102

File changed.

Preview size limit exceeded, changes collapsed.

src/implementation/matrices/tnlSparseMatrix_impl.h

+7 −0

Original line number	Diff line number	Diff line
		@@ -111,5 +111,12 @@ bool tnlSparseMatrix< Real, Device, Index >::allocateMatrixElements( const Index
		return true;
		}

		// Base-class printStructure() for sparse matrices. The body is empty, so
		// calling it writes nothing to `str`.
		// NOTE(review): looks like a placeholder for formats that do not provide
		// their own structure printer - confirm whether any output is expected.
		template< typename Real,
		typename Device,
		typename Index >
		void tnlSparseMatrix< Real, Device, Index >::printStructure( ostream& str ) const
		{

		}

		#endif /* TNLSPARSEMATRIX_IMPL_H_ */

src/matrices/tnlChunkedEllpackMatrix.h

+79 −12

Original line number	Diff line number	Diff line
		@@ -42,6 +42,16 @@ struct tnlChunkedEllpackSliceInfo
		{ return tnlString( "tnlChunkedEllpackSliceInfo" ); };
		};

		#ifdef HAVE_CUDA
		// Forward declaration of the CUDA kernel (`__global__`) used for the
		// matrix-vector product of the CUDA specialization of the matrix.
		// It is only declared when HAVE_CUDA is defined and appears ahead of the
		// class definition, which later names a kernel of this name as a friend.
		//
		// matrix    - pointer to a tnlChunkedEllpackMatrix< Real, tnlCuda, Index >
		// inVector  - pointer to the read-only input vector
		// outVector - pointer to the vector receiving the result
		// gridIdx   - NOTE(review): presumably the index of the current grid when
		//             the launch is split into several grids - confirm at the
		//             launch site in tnlChunkedEllpackMatrix_impl.h.
		template< typename Real,
		typename Index,
		typename Vector >
		__global__ void tnlChunkedEllpackMatrixVectorProductCudaKernel( const tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const Vector* inVector,
		Vector* outVector,
		int gridIdx );
		#endif

		template< typename Real, typename Device, typename Index >
		class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		{
		@@ -85,6 +95,11 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

		IndexType getDesiredChunkSize() const;

		// Const accessor returning an IndexType; in CUDA builds it is marked
		// `__device__ __host__`, so it can be called from both host and device code.
		// NOTE(review): presumably returns the `numberOfSlices` member - the
		// definition is not visible in this diff.
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		IndexType getNumberOfSlices() const;

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		@@ -165,6 +180,14 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		typename Vector::RealType rowVectorProduct( const IndexType row,
		const Vector& vector ) const;

		#ifdef HAVE_CUDA
		// Device-only (`__device__`) member template, declared only in CUDA builds.
		// Takes pointers to the input and output vectors plus an integer gridIdx,
		// and is `const` with respect to the matrix object.
		// NOTE(review): presumably computes the part of the matrix-vector product
		// belonging to the slice handled by the calling CUDA block, with gridIdx
		// identifying the current grid of a split launch - confirm against the
		// definition in tnlChunkedEllpackMatrix_impl.h.
		template< typename Vector >
		__device__ void computeSliceVectorProduct( const Vector* inVector,
		Vector* outVector,
		int gridIdx ) const;
		#endif


		template< typename Vector >
		void vectorProduct( const Vector& inVector,
		Vector& outVector ) const;
		@@ -194,11 +217,12 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

		void print( ostream& str ) const;

		// Structure printer for the Chunked Ellpack format; takes the output stream
		// to write to. It has the same signature as tnlSparseMatrix::printStructure().
		// NOTE(review): neither declaration is marked virtual here, so unless a
		// further base class declares it virtual, a call through a tnlSparseMatrix
		// reference selects the base version - confirm this is intended.
		void printStructure( ostream& str ) const;

		protected:


		void resolveSliceSizes( const tnlVector< Index, tnlHost, Index >& rowLengths,
		IndexType& numberOfSlices );
		void resolveSliceSizes( const tnlVector< Index, tnlHost, Index >& rowLengths );

		bool setSlice( const RowLengthsVector& rowLengths,
		const IndexType sliceIdx,
		@@ -211,6 +235,9 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		RealType& value,
		RealType& thisElementMultiplicator );

		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool addElementToChunkFast( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		@@ -218,18 +245,60 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >
		RealType& value,
		RealType& thisElementMultiplicator );

		// Variant of setChunk() with the same parameter list that, in CUDA builds,
		// is marked `__device__ __host__` and is therefore callable from device code.
		// The chunk is addressed by (sliceOffset, chunkIndex, chunkSize);
		// columnIndexes and values are read-only input arrays.
		// NOTE(review): presumably stores `elements` entries from the two arrays
		// into the addressed chunk - confirm in tnlChunkedEllpackMatrix_impl.h.
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		void setChunkFast( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		const IndexType* columnIndexes,
		const RealType* values,
		const IndexType elements );

		// Non-const member taking a chunk location (sliceOffset, chunkIndex,
		// chunkSize), read-only arrays of column indexes and values, and an element
		// count. Unlike setChunkFast() it carries no `__device__ __host__` marker.
		// NOTE(review): presumably the host-side entry point that also works when
		// the matrix data lives on the device - confirm in the implementation.
		void setChunk( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		const IndexType* columnIndexes,
		const RealType* values,
		const IndexType elements );

		// Const lookup within one chunk, addressed by (sliceOffset, chunkIndex,
		// chunkSize). `value` is passed by non-const reference and the function
		// returns a bool.
		// NOTE(review): presumably writes the stored value for `column` into
		// `value` and returns whether the column was found - confirm in the
		// implementation.
		bool getElementInChunk( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		const IndexType column,
		RealType& value ) const;

		// Variant of getElementInChunk() with the same parameter list that, in CUDA
		// builds, is marked `__device__ __host__` (callable from host and device).
		// NOTE(review): presumably accesses the chunk data directly, so the data
		// must be reachable from the calling side - confirm in the implementation.
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		bool getElementInChunkFast( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		const IndexType column,
		RealType& value ) const;

		// Const member addressing one chunk by (sliceOffset, chunkIndex, chunkSize).
		// `columns` and `values` are non-const pointers.
		// NOTE(review): presumably caller-provided output buffers that receive the
		// chunk's column indexes and values (at least chunkSize entries each) -
		// confirm the required buffer size in the implementation.
		void getChunk( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		IndexType* columns,
		RealType* values ) const;

		// Variant of getChunk() with the same parameter list that, in CUDA builds,
		// is marked `__device__ __host__` (callable from host and device).
		// NOTE(review): presumably copies the chunk's column indexes and values
		// into `columns` / `values` by direct access - confirm in the implementation.
		#ifdef HAVE_CUDA
		__device__ __host__
		#endif
		void getChunkFast( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		IndexType* columns,
		RealType* values ) const;

		// Const member template returning a Vector::RealType for one chunk,
		// addressed by (sliceOffset, chunkIndex, chunkSize), and a vector passed
		// by const reference.
		// NOTE(review): presumably the sum of value * vector[ column ] over the
		// chunk's stored elements - confirm in the implementation.
		template< typename Vector >
		typename Vector::RealType chunkVectorProduct( const IndexType sliceOffset,
		const IndexType chunkIndex,
		const IndexType chunkSize,
		const Vector& vector ) const;



		IndexType chunksInSlice, desiredChunkSize;

		@@ -237,22 +306,20 @@ class tnlChunkedEllpackMatrix : public tnlSparseMatrix< Real, Device, Index >

		tnlArray< ChunkedEllpackSliceInfo, Device, Index > slices;

		//IndexType numberOfSlices;
		IndexType numberOfSlices;

		typedef tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType > DeviceDependentCode;
		friend class tnlChunkedEllpackMatrixDeviceDependentCode< DeviceType >;
		friend class tnlChunkedEllpackMatrix< RealType, tnlHost, IndexType >;
		friend class tnlChunkedEllpackMatrix< RealType, tnlCuda, IndexType >;

		/*#ifdef HAVE_CUDA
		friend void tnlChunkedEllpackMatrix_setSlices_CudaKernel< Real, Index, 256 >( tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const RowLengthsVector* rowLengths,
		const Index numberOfSlices,
		Index* elementsToAllocation,
		const Index gridIdx );
		#endif*/


		#ifdef HAVE_CUDA
		// Grants the CUDA matrix-vector product kernel access to the protected
		// members of this class (CUDA builds only).
		// NOTE(review): this befriends a function template parameterised on
		// `Vector` only, using the enclosing class's Real and Index, whereas the
		// kernel forward-declared ahead of the class has three template parameters
		// (Real, Index, Vector). In standard C++ these are distinct templates, so
		// verify that this friend declaration really matches the kernel that is
		// launched.
		template< typename Vector >
		friend void tnlChunkedEllpackMatrixVectorProductCudaKernel( const tnlChunkedEllpackMatrix< Real, tnlCuda, Index >* matrix,
		const Vector* inVector,
		Vector* outVector,
		int gridIdx );
		#endif
		};

		#include <implementation/matrices/tnlChunkedEllpackMatrix_impl.h>

src/matrices/tnlSparseMatrix.h

+3 −0

Original line number	Diff line number	Diff line
		@@ -49,10 +49,13 @@ class tnlSparseMatrix : public tnlMatrix< Real, Device, Index >

		bool load( tnlFile& file );

		// Structure printer taking the output stream to write to. The definition
		// added in tnlSparseMatrix_impl.h has an empty body, i.e. it prints nothing.
		void printStructure( ostream& str ) const;

		protected:

		bool allocateMatrixElements( const IndexType& numberOfMatrixElements );


		tnlVector< Index, Device, Index > columnIndexes;
		};

tests/unit-tests/matrices/tnlChunkedEllpackMatrixTest.cpp

+1 −1

Original line number	Diff line number	Diff line
		@@ -27,7 +27,7 @@
		int main( int argc, char* argv[] )
		{
		tnlSparseMatrixTester< tnlChunkedEllpackMatrix< float, tnlHost, int >, tnlChunkedEllpackMatrixTestSetup< 4, 2 > > tester;
		//tester.setElementTest();
		//tester. vectorProduct_DiagonalMatrixTest();

		#ifdef HAVE_CPPUNIT
		if( ! tnlUnitTestStarter :: run< tnlSparseMatrixTester< tnlChunkedEllpackMatrix< float, tnlHost, int >, tnlChunkedEllpackMatrixTestSetup< 4, 2 > > >() \|\|